jedi-fork/jedi/parser/fast.py

"""
Basically a parser that is faster, because it tries to parse only parts and if
anything changes, it only reparses the changed parts. But because it's not
finished (and still not working as I want), I won't document it any further.
"""
import copy
import re
import difflib

from jedi._compatibility import use_metaclass
from jedi import settings
from jedi.common import splitlines
from jedi.parser import ParserWithRecovery
from jedi.parser.tree import Module, search_ancestor, EndMarker, Flow
from jedi.parser.utils import parser_cache
from jedi import debug
from jedi.parser.tokenize import (generate_tokens, NEWLINE, TokenInfo,
                                  ENDMARKER, INDENT, DEDENT)


class CachedFastParser(type):
    """ This is a metaclass for caching `FastParser`. """
    def __call__(self, grammar, source, module_path=None):
        pi = parser_cache.get(module_path, None)
        if pi is None or not settings.fast_parser:
            return ParserWithRecovery(grammar, source, module_path)

        parser = pi.parser
        d = DiffParser(parser)
        new_lines = splitlines(source, keepends=True)
        parser.module = parser._parsed = d.update(new_lines)
        return parser


class FastParser(use_metaclass(CachedFastParser)):
    pass


def _merge_used_names(base_dict, other_dict):
    for key, names in other_dict.items():
        base_dict.setdefault(key, []).extend(names)


def _get_last_line(node_or_leaf):
    last_leaf = node_or_leaf.last_leaf()
    if last_leaf.type == 'error_leaf':
        typ = last_leaf.original_type
    else:
        typ = last_leaf.type
    if typ == 'newline':
        return last_leaf.start_pos[0]
    else:
        return last_leaf.end_pos[0]


def _flows_finished(grammar, stack):
    """
    if, while, for and try might not be finished, because another part might
    still be parsed.
    """
    for dfa, newstate, (symbol_number, nodes) in stack:
        if grammar.number2symbol[symbol_number] in ('if_stmt', 'while_stmt',
                                                    'for_stmt', 'try_stmt'):
            return False
    return True


def suite_or_file_input_is_valid(grammar, stack):
    if not _flows_finished(grammar, stack):
        return False

    for dfa, newstate, (symbol_number, nodes) in reversed(stack):
        if grammar.number2symbol[symbol_number] == 'suite':
            # If only newline is in the suite, the suite is not valid, yet.
            return len(nodes) > 1
    # Not reaching a suite means that we're dealing with file_input levels
    # where there's no need for a valid statement in it. It can also be empty.
    return True


def _is_flow_node(node):
    try:
        value = node.children[0].value
    except AttributeError:
        return False
    return value in ('if', 'for', 'while', 'try')


def _last_leaf_is_newline(last_leaf):
    if last_leaf.prefix.endswith('\n'):
        return True
    if last_leaf.prefix:
        return False
    previous_leaf = last_leaf.get_previous_leaf()
    return (previous_leaf.type == 'newline' or
            previous_leaf.type == 'error_leaf' and
            previous_leaf.original_type == 'newline')


def _update_positions(nodes, line_offset):
    for node in nodes:
        try:
            children = node.children
        except AttributeError:
            # Is a leaf
            node.start_pos = node.start_pos[0] + line_offset, node.start_pos[1]
        else:
            _update_positions(children, line_offset)


class DiffParser(object):
    endmarker_type = 'endmarker'

    def __init__(self, parser):
        self._parser = parser
        self._grammar = self._parser._grammar
        self._old_module = parser.get_root_node()

    def _reset(self):
        self._copy_count = 0
        self._parser_count = 0

        self._parsed_until_line = 0
        self._copied_ranges = []

        self._old_children = self._old_module.children
        self._new_children = []
        self._new_module = Module(self._new_children)
        self._new_module.path = self._old_module.path
        self._new_module.used_names = {}
        self._prefix = ''

    def update(self, lines_new):
        '''
        The algorithm works as follows:

        Equal:
            - Assure that the start is a newline, otherwise parse until we get
              one.
            - Copy from parsed_until_line + 1 to max(i2 + 1)
            - Make sure that the indentation is correct (e.g. add DEDENT)
            - Add old and change positions
        Insert:
            - Parse from parsed_until_line + 1 to min(j2 + 1), hopefully not
              much more.
        Always:
            - Set parsed_until_line

        Returns the new module node.
        '''
        self._parser_lines_new = lines_new
        self._added_newline = False
        if lines_new[-1] != '':
            # The Python grammar needs a newline at the end of a file, but for
            # everything else we keep working with lines_new here.
            self._parser_lines_new = list(lines_new)
            self._parser_lines_new[-1] += '\n'
            self._added_newline = True

        self._reset()

        line_length = len(lines_new)
        lines_old = splitlines(self._parser.source, keepends=True)
        sm = difflib.SequenceMatcher(None, lines_old, self._parser_lines_new)
        debug.dbg('diff: line_lengths old: %s, new: %s' % (len(lines_old), line_length))
        for operation, i1, i2, j1, j2 in sm.get_opcodes():
            debug.dbg('diff %s old[%s:%s] new[%s:%s]',
                      operation, i1 + 1, i2, j1 + 1, j2)

            if j2 == line_length + int(self._added_newline):
                # The empty part after the last newline is not relevant.
                j2 -= 1

            if operation == 'equal':
                line_offset = j1 - i1
                self._copy_from_old_parser(line_offset, i2, j2)
            elif operation == 'replace':
                self._parse(until_line=j2)
            elif operation == 'insert':
                self._parse(until_line=j2)
            else:
                assert operation == 'delete'

        # Cleanup (setting endmarker, used_names)
        self._cleanup()
        if self._added_newline:
            self._parser.module = self._parser._parsed = self._new_module
            self._parser.remove_last_newline()
            self._parsed_until_line -= 1

        self._parser.source = ''.join(lines_new)
        self._old_module = self._new_module

        assert self._new_module.end_pos[0] == line_length

        return self._new_module

    def _copy_from_old_parser(self, line_offset, until_line_old, until_line_new):
        while until_line_new > self._parsed_until_line:
            parsed_until_line_old = self._parsed_until_line - line_offset
            line_stmt = self._get_old_line_stmt(parsed_until_line_old + 1)
            if line_stmt is None:
                # Parse 1 line at least. We don't need more, because we just
                # want to get into a state where the old parser has statements
                # again that can be copied (e.g. not lines within parentheses).
                self._parse(self._parsed_until_line + 1)
            else:
                p_children = line_stmt.parent.children
                index = p_children.index(line_stmt)

                # Match all the nodes that are in the wanted range.
                nodes = self._divide_nodes(p_children[index:], until_line_old)
                if nodes:
                    self._copy_count += 1
                    _update_positions(nodes, line_offset)
                    self._insert_nodes(nodes)

                    from_ = nodes[0].get_start_pos_of_prefix()[0]
                    to = _get_last_line(nodes[-1])
                    self._copied_ranges.append((from_, to))

                    debug.dbg('diff actually copy %s to %s', from_, to)
                # We have copied as much as possible (but definitely not too
                # much). Therefore we just parse the rest.
                # We might not reach the end, because there's a statement
                # that is not finished.
                self._parse(until_line_new)
                break

    def _get_old_line_stmt(self, old_line):
        leaf = self._old_module.get_leaf_for_position((old_line, 0), include_prefixes=True)

        if leaf.type == 'newline':
            leaf = leaf.get_next_leaf()
        if leaf.get_start_pos_of_prefix()[0] == old_line:
            node = leaf
            # TODO use leaf.get_definition one day when that one is working
            # well.
            while node.parent.type not in ('file_input', 'suite'):
                node = node.parent
            return node
        # Must be on the same line. Otherwise we need to parse that bit.
        return None

    def _insert_nodes(self, nodes):
        """
        Returns the scope that a node is a part of.
        """
        # Needs to be done before resetting the parsed
        before_node = self._get_before_insertion_node()

        last_leaf = nodes[-1].last_leaf()
        is_endmarker = last_leaf.type == self.endmarker_type
        if is_endmarker:
            self._parsed_until_line = last_leaf.start_pos[0]
            if _last_leaf_is_newline(last_leaf):
                self._parsed_until_line -= 1
        else:
            if last_leaf.type == 'newline':
                # Newlines end on the next line, which means that they would cover
                # the next line. That line is not fully parsed at this point.
                self._parsed_until_line = last_leaf.start_pos[0]
            else:
                self._parsed_until_line = last_leaf.end_pos[0]
        debug.dbg('set parsed_until %s', self._parsed_until_line)

        first_leaf = nodes[0].first_leaf()
        first_leaf.prefix = self._prefix + first_leaf.prefix
        self._prefix = ''
        if is_endmarker:
            self._prefix = last_leaf.prefix

            nodes = nodes[:-1]
            if not nodes:
                return self._new_module

        # Now the preparations are done. We are inserting the nodes.
        if before_node is None:  # Everything is empty.
            self._new_children += nodes
            new_parent = self._new_module
        else:
            assert nodes[0].type != 'newline'
            line_indentation = nodes[0].start_pos[1]
            new_parent = before_node.parent
            while True:
                p_children = new_parent.children
                if new_parent.type == 'suite':
                    # A suite starts with NEWLINE, ...
                    indentation = p_children[1].start_pos[1]
                else:
                    indentation = p_children[0].start_pos[1]

                if line_indentation < indentation:  # Dedent
                    # We might be at the most outer layer: modules. We
                    # don't want to depend on the first statement
                    # having the right indentation.
                    if new_parent.parent is not None:
                        new_parent = search_ancestor(
                            new_parent,
                            ('suite', 'file_input')
                        )
                        continue

                p_children += nodes
                assert new_parent.type in ('suite', 'file_input')
                break

        # Reset the parents
        for node in nodes:
            node.parent = new_parent
        if new_parent.type == 'suite':
            return new_parent.get_parent_scope()

        return new_parent

    def _get_before_insertion_node(self):
        if not self._new_children:
            return None

        line = self._parsed_until_line + 1
        node = self._new_module.last_leaf()
        while True:
            parent = node.parent
            if parent.type in ('suite', 'file_input'):
                assert node.end_pos[0] <= line
                assert node.end_pos[1] == 0
                return node
            node = parent

    def _divide_node(self, node, until_line):
        if node.type not in ('classdef', 'funcdef'):
            return None

        suite = node.children[-1]
        if suite.type != 'suite':
            return None

        nodes = self._divide_nodes(suite.children, until_line)

        if len(nodes) < 2:
            # A suite only with newline is not valid.
            return None

        new_node = copy.copy(node)
        new_suite = copy.copy(suite)

        # And now set the correct parents
        for child in nodes:
            child.parent = new_suite
        new_suite.children = nodes

        new_node.children = list(new_node.children)
        new_node.children[-1] = new_suite
        for child in new_node.children:
            child.parent = new_node
        return new_node

    def _copy_divided_nodes(self, nodes):
        parent = nodes[-1].last_leaf().get_parent_scope()
        if parent == nodes[0].get_parent_scope():
            check_nodes = nodes
        else:
            n = parent
            while n is not None:
                if isinstance(n, Flow):
                    parent = n.get_parent_scope()
                n = n.parent
            check_nodes = parent.children

        last_node = check_nodes[-1]

        if last_node.type == 'suite':
            parent = last_node
            check_nodes = parent.children
            last_node = check_nodes[-1]

        drop_node_count = 0
        if last_node.type in ('error_leaf', 'error_node'):
            # Error leafs/nodes don't have a defined start/end. Error
            # nodes might not end with a newline (e.g. if there's an
            # open `(`). Therefore ignore all of them unless they are
            # succeeded with valid parser state.
            n = last_node
            # In this while loop we try to remove until we find a newline.
            while True:
                drop_node_count += 1
                try:
                    n = check_nodes[drop_node_count]
                except IndexError:
                    break
                if n.last_leaf().type == 'newline':
                    break
        elif _is_flow_node(last_node):
            # If we just copy flows at the end, they might be continued
            # after the copy limit (in the new parser).
            drop_node_count += 1

        if drop_node_count:
            node = self._drop_last_node(nodes[-1], last_node, drop_node_count)
            if node is None:
                nodes = nodes[:-drop_node_count]
            else:
                nodes[-1] = node
        return nodes

    def _drop_last_node(self, base_node, last_node_to_drop, drop_node_count):
        if base_node == last_node_to_drop:
            return None

        last_node = base_node.children[-1]
        child = self._drop_last_node(last_node, last_node_to_drop, drop_node_count)

        base_node = copy.copy(base_node)
        if child is None:
            if base_node.type == 'suite' and len(base_node.children) <= 1 + drop_node_count:
                return None
            if base_node.type in ('classdef', 'funcdef'):
                return None

            base_node.children = base_node.children[:-drop_node_count]
        else:
            base_node.children = list(base_node.children)
            base_node.children[-1] = child
            child.parent = base_node

        for c in base_node.children:
            c.parent = base_node
        return base_node

    def _divide_nodes(self, nodes, until_line):
        """
        Breaks up scopes and returns only the part until the given line.

        Tries to get the parts it can safely get and ignores the rest.
        """
        new_nodes = []
        for i, child in enumerate(nodes):
            # TODO this check might take a bit of time for large files. We
            # might want to change this to do more intelligent guessing or
            # binary search.
            if _get_last_line(child) > until_line:
                node = self._divide_node(child, until_line)
                if node is not None:
                    new_nodes.append(node)
                break
            else:
                new_nodes.append(child)

        if new_nodes:
            return self._copy_divided_nodes(new_nodes)
        return new_nodes

    def _parse(self, until_line):
        """
        Parses at least until the given line, but might just parse more until a
        valid state is reached.
        """
        while until_line > self._parsed_until_line:
            node = self._try_parse_part(until_line)
            nodes = self._get_children_nodes(node)
            self._insert_nodes(nodes)
            _merge_used_names(
                self._new_module.used_names,
                node.used_names
            )

    def _get_children_nodes(self, node):
        nodes = node.children
        first_element = nodes[0]
        # TODO this looks very strange...
        if first_element.type == 'error_leaf' and \
                first_element.original_type == 'indent':
            assert False, str(nodes)
            #assert nodes[-1].type == 'dedent'
            ## This means that the start and end leaf
            #nodes = nodes[1:-1] + [nodes[-1]]

        return nodes

    def _try_parse_part(self, until_line):
        """
        Sets up a normal parser that uses a spezialized tokenizer to only parse
        until a certain position (or a bit longer if the statement hasn't
        ended.
        """
        self._parser_count += 1
        # TODO speed up, shouldn't copy the whole list all the time.
        # memoryview?
        lines_after = self._parser_lines_new[self._parsed_until_line:]
        #print('parse_content', self._parsed_until_line, lines_after, until_line)
        tokenizer = self._diff_tokenize(
            lines_after,
            until_line,
            line_offset=self._parsed_until_line
        )
        self._active_parser = ParserWithRecovery(
            self._grammar,
            source='\n',
            start_parsing=False
        )
        return self._active_parser.parse(tokenizer=tokenizer)

    def _cleanup(self):
        """Add used names and an end marker."""
        # Add the used names from the old parser to the new one.
        copied_line_numbers = set()
        for l1, l2 in self._copied_ranges:
            copied_line_numbers.update(range(l1, l2 + 1))

        new_used_names = self._new_module.used_names
        for key, names in self._old_module.used_names.items():
            for name in names:
                if name.start_pos[0] in copied_line_numbers:
                    new_used_names.setdefault(key, []).append(name)

        # Add an endmarker.
        try:
            last_leaf = self._new_module.last_leaf()
            end_pos = list(last_leaf.end_pos)
        except IndexError:
            end_pos = [1, 0]
        lines = splitlines(self._prefix)
        assert len(lines) > 0
        if len(lines) == 1:
            end_pos[1] += len(lines[0])
        else:
            end_pos[0] += len(lines) - 1
            end_pos[1] = len(lines[-1])

        endmarker = EndMarker('', tuple(end_pos), self._prefix)
        endmarker.parent = self._new_module
        self._new_children.append(endmarker)

    def _diff_tokenize(self, lines, until_line, line_offset=0):
        is_first_token = True
        omitted_first_indent = False
        indents = []
        l = iter(lines)
        tokens = generate_tokens(lambda: next(l, ''), use_exact_op_types=True)
        stack = self._active_parser.pgen_parser.stack
        for typ, string, start_pos, prefix in tokens:
            start_pos = start_pos[0] + line_offset, start_pos[1]
            if typ == INDENT:
                indents.append(start_pos[1])
                if is_first_token:
                    omitted_first_indent = True
                    # We want to get rid of indents that are only here because
                    # we only parse part of the file. These indents would only
                    # get parsed as error leafs, which doesn't make any sense.
                    is_first_token = False
                    continue
            is_first_token = False

            if typ == DEDENT:
                indents.pop()
                if omitted_first_indent and not indents:
                    # We are done here, only thing that can come now is an
                    # endmarker or another dedented code block.
                    typ, string, start_pos, prefix = next(tokens)
                    if '\n' in prefix:
                        prefix = re.sub(r'(<=\n)[^\n]+$', '', prefix)
                    else:
                        prefix = ''
                    yield TokenInfo(ENDMARKER, '', (start_pos[0] + line_offset, 0), prefix)
                    break
            elif typ == NEWLINE and start_pos[0] >= until_line:
                yield TokenInfo(typ, string, start_pos, prefix)
                # Check if the parser is actually in a valid suite state.
                if suite_or_file_input_is_valid(self._grammar, stack):
                    start_pos = start_pos[0] + 1, 0
                    while len(indents) > int(omitted_first_indent):
                        indents.pop()
                        yield TokenInfo(DEDENT, '', start_pos, '')

                    yield TokenInfo(ENDMARKER, '', start_pos, '')
                    break
                else:
                    continue

            yield TokenInfo(typ, string, start_pos, prefix)