jedi-fork/jedi/parser/diff.py

"""
Basically a parser that is faster, because it tries to parse only parts of a
file and, if anything changes, it only reparses the changed parts.

It works with a simple diff in the beginning and will try to reuse old parser
fragments.
"""
import copy
import re
import difflib

from jedi._compatibility import use_metaclass
from jedi import settings
from jedi.common import splitlines
from jedi.parser import ParserWithRecovery
from jedi.parser.tree import Module, search_ancestor, EndMarker, Flow
from jedi.parser.utils import parser_cache
from jedi import debug
from jedi.parser.tokenize import (generate_tokens, NEWLINE, TokenInfo,
                                  ENDMARKER, INDENT, DEDENT)


class CachedFastParser(type):
    """
    Metaclass for `FastParser` that reuses cached parsers.

    Calling the class looks up `module_path` in `parser_cache`. If an entry
    exists and `settings.fast_parser` is enabled, the cached parser is
    updated through a `DiffParser` and returned. Otherwise a fresh
    `ParserWithRecovery` is created.
    """
    def __call__(self, grammar, source, module_path=None):
        cache_item = parser_cache.get(module_path, None)
        if settings.fast_parser and cache_item is not None:
            cached_parser = cache_item.parser
            differ = DiffParser(cached_parser)
            updated_module = differ.update(splitlines(source, keepends=True))
            # Both attributes of the cached parser point to the new module.
            cached_parser.module = cached_parser._parsed = updated_module
            return cached_parser

        # Nothing cached (or the fast parser is disabled): parse from scratch.
        return ParserWithRecovery(grammar, source, module_path)


class FastParser(use_metaclass(CachedFastParser)):
    """
    Entry point for cached parsing.

    The body is intentionally empty: the base class is created with
    `CachedFastParser` (documented above as the metaclass that caches
    `FastParser`), so the work happens in `CachedFastParser.__call__`.
    """
    pass


def _merge_used_names(base_dict, other_dict):
    for key, names in other_dict.items():
        base_dict.setdefault(key, []).extend(names)


def _get_last_line(node_or_leaf):
    last_leaf = node_or_leaf.last_leaf()
    if last_leaf.type == 'error_leaf':
        typ = last_leaf.original_type
    else:
        typ = last_leaf.type
    if typ == 'newline':
        return last_leaf.start_pos[0]
    else:
        return last_leaf.end_pos[0]


def _flows_finished(grammar, stack):
    """
    if, while, for and try might not be finished, because another part might
    still be parsed.
    """
    for dfa, newstate, (symbol_number, nodes) in stack:
        if grammar.number2symbol[symbol_number] in ('if_stmt', 'while_stmt',
                                                    'for_stmt', 'try_stmt'):
            return False
    return True


def suite_or_file_input_is_valid(grammar, stack):
    """
    Check whether the parser stack is in a state where parsing may stop.

    That is the case if no flow statement is unfinished and the innermost
    suite on the stack (if there is one) contains more than just a newline.
    """
    if not _flows_finished(grammar, stack):
        return False

    for _dfa, _newstate, (symbol_number, nodes) in reversed(stack):
        if grammar.number2symbol[symbol_number] != 'suite':
            continue
        # If only newline is in the suite, the suite is not valid, yet.
        return len(nodes) > 1

    # No suite on the stack: we are on file_input level, where no valid
    # statement is required. It can also be empty.
    return True


def _is_flow_node(node):
    try:
        value = node.children[0].value
    except AttributeError:
        return False
    return value in ('if', 'for', 'while', 'try')


def _last_leaf_is_newline(last_leaf):
    if last_leaf.prefix.endswith('\n'):
        return True
    if last_leaf.prefix:
        return False
    previous_leaf = last_leaf.get_previous_leaf()
    return (previous_leaf.type == 'newline' or
            previous_leaf.type == 'error_leaf' and
            previous_leaf.original_type == 'newline')


def _update_positions(nodes, line_offset):
    for node in nodes:
        try:
            children = node.children
        except AttributeError:
            # Is a leaf
            node.start_pos = node.start_pos[0] + line_offset, node.start_pos[1]
        else:
            _update_positions(children, line_offset)


class DiffParser(object):
    endmarker_type = 'endmarker'

    def __init__(self, parser):
        """
        Remember `parser` together with its grammar and its current root
        node, which serves as the old module for the next update.
        """
        self._parser = parser
        self._grammar = parser._grammar
        self._old_module = parser.get_root_node()

    def _reset(self):
        """
        Reset the bookkeeping of a single update and create an empty new
        module that takes over the path of the old one.
        """
        # Counters for copied node groups and started parsers.
        self._copy_count = self._parser_count = 0

        self._parsed_until_line = 0
        self._copied_ranges = []
        self._prefix = self._last_prefix = ''

        self._old_children = self._old_module.children

        # The very same list object is handed to the new module and kept in
        # `_new_children`.
        children = []
        module = Module(children)
        module.path = self._old_module.path
        module.used_names = {}
        self._new_children = children
        self._new_module = module

    def update(self, lines_new):
        '''
        Build a new module for `lines_new` by diffing against the old source
        and reusing as many old nodes as possible.

        `lines_new` is a list of lines with line endings kept (the caller in
        this file passes `splitlines(source, keepends=True)`).

        The algorithm works as follows:

        Equal:
            - Assure that the start is a newline, otherwise parse until we get
              one.
            - Copy from parsed_until_line + 1 to max(i2 + 1)
            - Make sure that the indentation is correct (e.g. add DEDENT)
            - Add old and change positions
        Insert:
            - Parse from parsed_until_line + 1 to min(j2 + 1), hopefully not
              much more.
        Always:
            - Set parsed_until_line

        As a side effect `self._parser.source` is replaced by the new source
        and the new module becomes the old module for the next call.

        Returns the new module node.
        '''
        self._parser_lines_new = lines_new
        self._added_newline = False
        if lines_new[-1] != '':
            # The Python grammar needs a newline at the end of a file, but for
            # everything else we keep working with lines_new here.
            self._parser_lines_new = list(lines_new)
            self._parser_lines_new[-1] += '\n'
            self._added_newline = True

        self._reset()

        line_length = len(lines_new)
        lines_old = splitlines(self._parser.source, keepends=True)
        sm = difflib.SequenceMatcher(None, lines_old, self._parser_lines_new)
        debug.dbg('diff: line_lengths old: %s, new: %s' % (len(lines_old), line_length))
        # Opcode indices are 0-based and exclusive at the end: i1/i2 refer to
        # the old lines, j1/j2 to the new lines.
        for operation, i1, i2, j1, j2 in sm.get_opcodes():
            debug.dbg('diff %s old[%s:%s] new[%s:%s]',
                      operation, i1 + 1, i2, j1 + 1, j2)

            if j2 == line_length + int(self._added_newline):
                # The empty part after the last newline is not relevant.
                j2 -= 1

            if operation == 'equal':
                line_offset = j1 - i1
                self._copy_from_old_parser(line_offset, i2, j2)
            elif operation == 'replace':
                self._parse(until_line=j2)
            elif operation == 'insert':
                self._parse(until_line=j2)
            else:
                # Deleted lines need no work: nothing is copied or parsed.
                assert operation == 'delete'

        # Cleanup (setting endmarker, used_names)
        self._cleanup()
        if self._added_newline:
            # Undo the artificial trailing newline. The parser has to point
            # to the new module before `remove_last_newline` is called.
            self._parser.module = self._parser._parsed = self._new_module
            self._parser.remove_last_newline()
            self._parsed_until_line -= 1

        self._parser.source = ''.join(lines_new)
        self._old_module = self._new_module

        # Sanity check: the new module has to end on the last line.
        assert self._new_module.end_pos[0] == line_length

        return self._new_module

    def _copy_from_old_parser(self, line_offset, until_line_old, until_line_new):
        """
        Reuse nodes of the old module for an unchanged ("equal") range.

        `line_offset` is the difference between new and old line numbers of
        the range, `until_line_old` / `until_line_new` are the last lines of
        the range in the old / new source. Whatever cannot be copied is
        parsed instead.
        """
        while until_line_new > self._parsed_until_line:
            # Translate the current position back into old line numbers.
            parsed_until_line_old = self._parsed_until_line - line_offset
            line_stmt = self._get_old_line_stmt(parsed_until_line_old + 1)
            if line_stmt is None:
                # Parse 1 line at least. We don't need more, because we just
                # want to get into a state where the old parser has statements
                # again that can be copied (e.g. not lines within parentheses).
                self._parse(self._parsed_until_line + 1)
            else:
                p_children = line_stmt.parent.children
                index = p_children.index(line_stmt)

                # Match all the nodes that are in the wanted range.
                nodes = self._divide_nodes(p_children[index:], until_line_old)
                if nodes:
                    self._copy_count += 1
                    # The old nodes are moved to their new lines in place.
                    _update_positions(nodes, line_offset)
                    self._insert_nodes(nodes)

                    # Remember the copied lines; `_cleanup` uses them to take
                    # over the used names of the old module.
                    from_ = nodes[0].get_start_pos_of_prefix()[0]
                    to = _get_last_line(nodes[-1])
                    self._copied_ranges.append((from_, to))

                    debug.dbg('diff actually copy %s to %s', from_, to)
                # We have copied as much as possible (but definitely not too
                # much). Therefore we just parse the rest.
                # We might not reach the end, because there's a statement
                # that is not finished.
                self._parse(until_line_new)
                break

    def _get_old_line_stmt(self, old_line):
        """
        Return the node of the old module that starts on `old_line` and is a
        direct child of a suite or of the module, or None if no leaf
        (including its prefix) starts on that line.
        """
        leaf = self._old_module.get_leaf_for_position((old_line, 0), include_prefixes=True)
        if leaf.type == 'newline':
            leaf = leaf.get_next_leaf()

        if leaf.get_start_pos_of_prefix()[0] != old_line:
            # Must be on the same line. Otherwise we need to parse that bit.
            return None

        # TODO use leaf.get_definition one day when that one is working
        # well.
        stmt = leaf
        while stmt.parent.type not in ('file_input', 'suite'):
            stmt = stmt.parent
        return stmt

    def _insert_nodes(self, nodes):
        """
        Append `nodes` (copied or freshly parsed) to the new module tree.

        Updates `_parsed_until_line` and the pending prefixes, strips a
        trailing endmarker from `nodes` and looks for the suite / module
        whose indentation fits the first node.

        Returns the scope that the nodes were inserted into (the new module
        itself if `nodes` only consisted of an endmarker).
        """
        # Needs to be done before resetting the parsed
        before_node = self._get_before_insertion_node()

        last_leaf = nodes[-1].last_leaf()
        is_endmarker = last_leaf.type == self.endmarker_type
        self._last_prefix = ''
        if is_endmarker:
            self._parsed_until_line = last_leaf.start_pos[0]
            try:
                separation = last_leaf.prefix.rindex('\n')
            except ValueError:
                pass
            else:
                # Remove the whitespace part of the prefix after a newline.
                # That is not relevant if parentheses were opened. Always parse
                # until the end of a line.
                last_leaf.prefix, self._last_prefix = \
                    last_leaf.prefix[:separation + 1], last_leaf.prefix[separation + 1:]

            if _last_leaf_is_newline(last_leaf):
                self._parsed_until_line -= 1
        else:
            if last_leaf.type == 'newline':
                # Newlines end on the next line, which means that they would cover
                # the next line. That line is not fully parsed at this point.
                self._parsed_until_line = last_leaf.start_pos[0]
            else:
                self._parsed_until_line = last_leaf.end_pos[0]
        debug.dbg('set parsed_until %s', self._parsed_until_line)

        # The prefix left over from the previous insertion belongs in front
        # of the first new leaf.
        first_leaf = nodes[0].first_leaf()
        first_leaf.prefix = self._prefix + first_leaf.prefix
        self._prefix = ''

        if is_endmarker:
            # The endmarker itself is not inserted; only its prefix is kept
            # for whatever gets inserted next.
            self._prefix = last_leaf.prefix

            nodes = nodes[:-1]
            if not nodes:
                return self._new_module

        # Now the preparations are done. We are inserting the nodes.
        if before_node is None:  # Everything is empty.
            self._new_children += nodes
            new_parent = self._new_module
        else:
            assert nodes[0].type != 'newline'
            line_indentation = nodes[0].start_pos[1]
            new_parent = before_node.parent
            # Walk outwards until the indentation of the new nodes fits.
            while True:
                p_children = new_parent.children
                if new_parent.type == 'suite':
                    # A suite starts with NEWLINE, ...
                    indentation = p_children[1].start_pos[1]
                else:
                    indentation = p_children[0].start_pos[1]

                if line_indentation < indentation:  # Dedent
                    # We might be at the most outer layer: modules. We
                    # don't want to depend on the first statement
                    # having the right indentation.
                    if new_parent.parent is not None:
                        new_parent = search_ancestor(
                            new_parent,
                            ('suite', 'file_input')
                        )
                        continue

                p_children += nodes
                assert new_parent.type in ('suite', 'file_input')
                break

        # Reset the parents
        for node in nodes:
            node.parent = new_parent
        if new_parent.type == 'suite':
            return new_parent.get_parent_scope()

        return new_parent

    def _get_before_insertion_node(self):
        """
        Return the last node of the new module that is a direct child of a
        suite or of the module itself, or None if the new module is still
        empty.
        """
        if not self._new_children:
            return None

        line = self._parsed_until_line + 1
        candidate = self._new_module.last_leaf()
        # Climb until the candidate sits directly in a suite / file_input.
        while candidate.parent.type not in ('suite', 'file_input'):
            candidate = candidate.parent

        assert candidate.end_pos[0] <= line
        assert candidate.end_pos[1] == 0 or '\n' in self._prefix
        return candidate

    def _divide_node(self, node, until_line):
        """
        Return a shallow copy of a class / function that only contains the
        part of its suite up to `until_line`.

        Returns None if `node` is not a classdef / funcdef ending in a suite,
        or if less than two suite children remain. The kept children are
        re-parented to the copies.
        """
        if node.type not in ('classdef', 'funcdef'):
            return None

        suite = node.children[-1]
        if suite.type != 'suite':
            return None

        kept = self._divide_nodes(suite.children, until_line)
        if len(kept) < 2:
            # A suite only with newline is not valid.
            return None

        node_copy = copy.copy(node)
        suite_copy = copy.copy(suite)

        # The copied suite owns the kept children.
        suite_copy.children = kept
        for child in kept:
            child.parent = suite_copy

        # The copied node gets its own child list that ends in the new suite.
        node_children = list(node_copy.children)
        node_children[-1] = suite_copy
        node_copy.children = node_children
        for child in node_children:
            child.parent = node_copy
        return node_copy

    def _copy_divided_nodes(self, nodes):
        """
        Remove trailing nodes from `nodes` that must not be copied.

        If the relevant group of nodes ends with an error leaf, an error node
        or a flow node (if / for / while / try), nodes are dropped from the
        end until a node ending in a newline is found. Returns the (possibly
        shortened or partly replaced) list of nodes.
        """
        # Find the group of nodes whose end has to be checked.
        parent = nodes[-1].last_leaf().get_parent_scope()
        if parent == nodes[0].get_parent_scope():
            check_nodes = nodes
        else:
            n = parent
            # Walks all the way up; the scope of the last Flow ancestor
            # that is found is used.
            while n is not None:
                if isinstance(n, Flow):
                    parent = n.get_parent_scope()
                n = n.parent
            check_nodes = parent.children

        last_node = check_nodes[-1]

        if last_node.type == 'suite':
            # Look into the suite instead of at the suite itself.
            parent = last_node
            check_nodes = parent.children
            last_node = check_nodes[-1]

        drop_node_count = 0
        if last_node.type in ('error_leaf', 'error_node') or _is_flow_node(last_node):
            # Error leafs/nodes don't have a defined start/end. Error
            # nodes might not end with a newline (e.g. if there's an
            # open `(`). Therefore ignore all of them unless they are
            # succeeded with valid parser state.
            # If we copy flows at the end, they might be continued
            # after the copy limit (in the new parser).
            n = last_node
            # In this while loop we try to remove until we find a newline.
            while True:
                drop_node_count += 1
                try:
                    n = check_nodes[-drop_node_count - 1]
                except IndexError:
                    break
                if n.last_leaf().type == 'newline':
                    break

        if drop_node_count:
            # `_drop_last_node` returns None if the whole last node has to
            # go; otherwise it returns a copy without the dropped part.
            node = self._drop_last_node(nodes[-1], last_node, drop_node_count)
            if node is None:
                nodes = nodes[:-drop_node_count]
            else:
                nodes[-1] = node
        return nodes

    def _drop_last_node(self, base_node, last_node_to_drop, drop_node_count):
        """
        Return a shallow copy of `base_node` without its last
        `drop_node_count` descendants on the path to `last_node_to_drop`.

        Returns None if `base_node` itself has to be dropped: it is the node
        to drop, it is a class / function that would lose its last child, or
        it is a suite that would be left with nothing but its first child.
        """
        if base_node == last_node_to_drop:
            return None

        replacement = self._drop_last_node(
            base_node.children[-1], last_node_to_drop, drop_node_count
        )

        clone = copy.copy(base_node)
        if replacement is not None:
            # Only the last child changed; swap it in a fresh child list.
            children = list(clone.children)
            children[-1] = replacement
            clone.children = children
            replacement.parent = clone
        elif clone.type in ('classdef', 'funcdef') or (
                clone.type == 'suite'
                and len(clone.children) <= 1 + drop_node_count):
            return None
        else:
            clone.children = clone.children[:-drop_node_count]

        for child in clone.children:
            child.parent = clone
        return clone

    def _divide_nodes(self, nodes, until_line):
        """
        Breaks up scopes and returns only the part until the given line.

        Nodes that end on or before `until_line` are taken as they are. The
        first node that reaches further is divided if possible and ends the
        search. Tries to get the parts it can safely get and ignores the
        rest.
        """
        new_nodes = []
        for child in nodes:
            # TODO this check might take a bit of time for large files. We
            # might want to change this to do more intelligent guessing or
            # binary search.
            if _get_last_line(child) <= until_line:
                new_nodes.append(child)
                continue

            divided = self._divide_node(child, until_line)
            if divided is not None:
                new_nodes.append(divided)
            break

        if not new_nodes:
            return new_nodes
        return self._copy_divided_nodes(new_nodes)

    def _parse(self, until_line):
        """
        Parses at least until the given line, but might just parse more until a
        valid state is reached.

        Every parsed part is inserted into the new module and its used names
        are merged into the used names of the new module.
        """
        while self._parsed_until_line < until_line:
            parsed = self._try_parse_part(until_line)
            self._insert_nodes(self._get_children_nodes(parsed))
            _merge_used_names(self._new_module.used_names, parsed.used_names)

    def _get_children_nodes(self, node):
        """
        Return the children of a freshly parsed `node`.

        A leading error leaf that originally was an indent is not expected
        and trips an assertion showing the children.
        """
        children = node.children
        head = children[0]
        # TODO this looks very strange... An earlier idea was to cut the
        # surrounding indent / dedent leafs off instead of failing here.
        assert not (head.type == 'error_leaf'
                    and head.original_type == 'indent'), str(children)
        return children

    def _try_parse_part(self, until_line):
        """
        Sets up a normal parser that uses a specialized tokenizer to only parse
        until a certain position (or a bit longer if the statement hasn't
        ended).

        Parsing starts after `self._parsed_until_line`. Returns the result of
        the parser's `parse` call, whose `children` and `used_names` are read
        by `_parse`.
        """
        self._parser_count += 1
        # TODO speed up, shouldn't copy the whole list all the time.
        # memoryview?
        lines_after = self._parser_lines_new[self._parsed_until_line:]
        # `_diff_tokenize` is a generator: its body (which reads
        # `self._active_parser`) only runs once it is iterated, so it is fine
        # to create it before the parser below exists.
        tokenizer = self._diff_tokenize(
            lines_after,
            until_line,
            line_offset=self._parsed_until_line
        )
        # The source argument is only a placeholder; the tokens come from
        # the tokenizer that is passed to `parse`.
        self._active_parser = ParserWithRecovery(
            self._grammar,
            source='\n',
            start_parsing=False
        )
        return self._active_parser.parse(tokenizer=tokenizer)

    def _cleanup(self):
        """
        Finish the new module: take over the used names of all copied lines
        from the old module and append an endmarker that carries the
        remaining prefix.
        """
        # Every line that was copied from the old module.
        copied_lines = set()
        for first_line, last_line in self._copied_ranges:
            copied_lines.update(range(first_line, last_line + 1))

        # Names of the old module survive if they are on a copied line.
        merged_names = self._new_module.used_names
        for key, old_names in self._old_module.used_names.items():
            reusable = [n for n in old_names if n.start_pos[0] in copied_lines]
            if reusable:
                merged_names.setdefault(key, []).extend(reusable)

        # The endmarker starts where the last leaf ends (or at the very
        # beginning if there is no leaf at all) plus the pending prefix.
        try:
            end_pos = list(self._new_module.last_leaf().end_pos)
        except IndexError:
            end_pos = [1, 0]

        prefix_lines = splitlines(self._prefix)
        line_count = len(prefix_lines)
        assert line_count > 0
        if line_count == 1:
            end_pos[1] += len(prefix_lines[0])
        else:
            end_pos[0] += line_count - 1
            end_pos[1] = len(prefix_lines[-1])

        endmarker = EndMarker('', tuple(end_pos), self._prefix + self._last_prefix)
        endmarker.parent = self._new_module
        self._new_children.append(endmarker)

    def _diff_tokenize(self, lines, until_line, line_offset=0):
        """
        Tokenize `lines` for the active parser and stop as early as possible.

        This is a generator of `TokenInfo` tuples. `line_offset` is added to
        the line of every start position, because `lines` is only the rest of
        the file. Tokenizing ends with an artificial ENDMARKER when either

        - the code block dedents below the (omitted) first indentation, or
        - a NEWLINE on or after `until_line` is reached while the parser
          stack is in a valid suite / file_input state.
        """
        is_first_token = True
        omitted_first_indent = False
        indents = []
        line_iter = iter(lines)
        tokens = generate_tokens(lambda: next(line_iter, ''), use_exact_op_types=True)
        stack = self._active_parser.pgen_parser.stack
        for typ, string, start_pos, prefix in tokens:
            start_pos = start_pos[0] + line_offset, start_pos[1]
            if typ == INDENT:
                indents.append(start_pos[1])
                if is_first_token:
                    omitted_first_indent = True
                    # We want to get rid of indents that are only here because
                    # we only parse part of the file. These indents would only
                    # get parsed as error leafs, which doesn't make any sense.
                    is_first_token = False
                    continue
            is_first_token = False

            if typ == DEDENT:
                indents.pop()
                if omitted_first_indent and not indents:
                    # We are done here, only thing that can come now is an
                    # endmarker or another dedented code block.
                    typ, string, start_pos, prefix = next(tokens)
                    if '\n' in prefix:
                        # Keep the prefix up to and including its last
                        # newline and drop the indentation that follows.
                        # (The previous pattern `(<=\n)[^\n]+$` was a
                        # malformed lookbehind that matched a literal `<=`
                        # and therefore never removed anything.)
                        prefix = re.sub(r'[^\n]+\Z', '', prefix)
                    else:
                        prefix = ''
                    # This start_pos comes straight from the tokenizer, so the
                    # line offset still has to be added.
                    yield TokenInfo(ENDMARKER, '', (start_pos[0] + line_offset, 0), prefix)
                    break
            elif typ == NEWLINE and start_pos[0] >= until_line:
                yield TokenInfo(typ, string, start_pos, prefix)
                # Check if the parser is actually in a valid suite state.
                if suite_or_file_input_is_valid(self._grammar, stack):
                    start_pos = start_pos[0] + 1, 0
                    # Close every indentation that is still open, except for
                    # the omitted first one.
                    while len(indents) > int(omitted_first_indent):
                        indents.pop()
                        yield TokenInfo(DEDENT, '', start_pos, '')

                    yield TokenInfo(ENDMARKER, '', start_pos, '')
                    break
                else:
                    # The NEWLINE was already yielded above; keep going until
                    # the parser reaches a valid state.
                    continue

            yield TokenInfo(typ, string, start_pos, prefix)