forked from VimPlug/jedi
Move the parser.fast module to parser.diff.
580  jedi/parser/diff.py  Normal file
@@ -0,0 +1,580 @@
"""
Basically a parser that is faster, because it tries to parse only parts of the
code and, if anything changes, only reparses the changed parts. But because
it's not finished (and still not working as I want), I won't document it any
further.
"""
import copy
import re
import difflib

from jedi._compatibility import use_metaclass
from jedi import settings
from jedi.common import splitlines
from jedi.parser import ParserWithRecovery
from jedi.parser.tree import Module, search_ancestor, EndMarker, Flow
from jedi.parser.utils import parser_cache
from jedi import debug
from jedi.parser.tokenize import (generate_tokens, NEWLINE, TokenInfo,
                                  ENDMARKER, INDENT, DEDENT)


class CachedFastParser(type):
    """ This is a metaclass for caching `FastParser`. """
    def __call__(self, grammar, source, module_path=None):
        pi = parser_cache.get(module_path, None)
        if pi is None or not settings.fast_parser:
            return ParserWithRecovery(grammar, source, module_path)

        parser = pi.parser
        d = DiffParser(parser)
        new_lines = splitlines(source, keepends=True)
        parser.module = parser._parsed = d.update(new_lines)
        return parser


class FastParser(use_metaclass(CachedFastParser)):
    pass


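# A minimal, self-contained sketch of the metaclass pattern above (the names
# below are illustrative and not part of jedi): a metaclass' __call__
# intercepts instantiation, so "constructing" an instance may return a cached
# object instead of a new one, just like FastParser() above may return a
# recycled parser.
class _CachingMeta(type):
    _cache = {}

    def __call__(cls, key):
        if key not in cls._cache:
            cls._cache[key] = super(_CachingMeta, cls).__call__(key)
        return cls._cache[key]


class _Cached(use_metaclass(_CachingMeta)):
    def __init__(self, key):
        self.key = key


assert _Cached('x') is _Cached('x')  # served from the cache on the 2nd call

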
def _merge_used_names(base_dict, other_dict):
    for key, names in other_dict.items():
        base_dict.setdefault(key, []).extend(names)


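# A tiny illustration of _merge_used_names with made-up data: lists for keys
# present in both dicts are extended, new keys are simply added.
_example_base = {'foo': ['a']}
_merge_used_names(_example_base, {'foo': ['b'], 'bar': ['c']})
assert _example_base == {'foo': ['a', 'b'], 'bar': ['c']}

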
def _get_last_line(node_or_leaf):
    last_leaf = node_or_leaf.last_leaf()
    if last_leaf.type == 'error_leaf':
        typ = last_leaf.original_type
    else:
        typ = last_leaf.type
    if typ == 'newline':
        return last_leaf.start_pos[0]
    else:
        return last_leaf.end_pos[0]


def _flows_finished(grammar, stack):
    """
    if, while, for and try might not be finished, because another part might
    still be parsed.
    """
    for dfa, newstate, (symbol_number, nodes) in stack:
        if grammar.number2symbol[symbol_number] in ('if_stmt', 'while_stmt',
                                                    'for_stmt', 'try_stmt'):
            return False
    return True


def suite_or_file_input_is_valid(grammar, stack):
    if not _flows_finished(grammar, stack):
        return False

    for dfa, newstate, (symbol_number, nodes) in reversed(stack):
        if grammar.number2symbol[symbol_number] == 'suite':
            # If only a newline is in the suite, the suite is not valid yet.
            return len(nodes) > 1
    # Not reaching a suite means that we're dealing with file_input levels
    # where there's no need for a valid statement in it. It can also be empty.
    return True


def _is_flow_node(node):
    try:
        value = node.children[0].value
    except AttributeError:
        return False
    return value in ('if', 'for', 'while', 'try')


def _last_leaf_is_newline(last_leaf):
    if last_leaf.prefix.endswith('\n'):
        return True
    if last_leaf.prefix:
        return False
    previous_leaf = last_leaf.get_previous_leaf()
    return (previous_leaf.type == 'newline' or
            previous_leaf.type == 'error_leaf' and
            previous_leaf.original_type == 'newline')


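# Illustration of the "unfinished flow" problem the two helpers above guard
# against (illustrative source text, not jedi data): after tokenizing the
# first two lines of
#
#     if x:
#         y
#     else:
#         z
#
# the 'if_stmt' is still open on the pgen stack, so _flows_finished() is
# False and tokenization must not stop there; similarly, a suite containing
# only a NEWLINE is not yet a complete suite.

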
def _update_positions(nodes, line_offset):
    for node in nodes:
        try:
            children = node.children
        except AttributeError:
            # Is a leaf
            node.start_pos = node.start_pos[0] + line_offset, node.start_pos[1]
        else:
            _update_positions(children, line_offset)


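# A sketch of _update_positions using hypothetical stand-in objects (these
# are not jedi's tree classes): leaves carry start_pos, inner nodes only
# carry children, and only line numbers are shifted.
class _FakeLeaf(object):
    def __init__(self, line, column):
        self.start_pos = (line, column)


class _FakeNode(object):
    def __init__(self, children):
        self.children = children


_fake_leaf = _FakeLeaf(3, 4)
_update_positions([_FakeNode([_fake_leaf])], line_offset=10)
assert _fake_leaf.start_pos == (13, 4)  # line shifted by 10, column untouched

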
class DiffParser(object):
    endmarker_type = 'endmarker'

    def __init__(self, parser):
        self._parser = parser
        self._grammar = self._parser._grammar
        self._old_module = parser.get_root_node()

    def _reset(self):
        self._copy_count = 0
        self._parser_count = 0

        self._parsed_until_line = 0
        self._copied_ranges = []

        self._old_children = self._old_module.children
        self._new_children = []
        self._new_module = Module(self._new_children)
        self._new_module.path = self._old_module.path
        self._new_module.used_names = {}
        self._prefix = ''

    def update(self, lines_new):
        '''
        The algorithm works as follows:

        Equal:
            - Assure that the start is a newline, otherwise parse until we get
              one.
            - Copy from parsed_until_line + 1 to max(i2 + 1)
            - Make sure that the indentation is correct (e.g. add DEDENT)
            - Add old and change positions
        Insert:
            - Parse from parsed_until_line + 1 to min(j2 + 1), hopefully not
              much more.
        Always:
            - Set parsed_until_line

        Returns the new module node.
        '''
        self._parser_lines_new = lines_new
        self._added_newline = False
        if lines_new[-1] != '':
            # The Python grammar needs a newline at the end of a file, but for
            # everything else we keep working with lines_new here.
            self._parser_lines_new = list(lines_new)
            self._parser_lines_new[-1] += '\n'
            self._added_newline = True

        self._reset()

        line_length = len(lines_new)
        lines_old = splitlines(self._parser.source, keepends=True)
        sm = difflib.SequenceMatcher(None, lines_old, self._parser_lines_new)
        debug.dbg('diff: line_lengths old: %s, new: %s'
                  % (len(lines_old), line_length))
        for operation, i1, i2, j1, j2 in sm.get_opcodes():
            debug.dbg('diff %s old[%s:%s] new[%s:%s]',
                      operation, i1 + 1, i2, j1 + 1, j2)

            if j2 == line_length + int(self._added_newline):
                # The empty part after the last newline is not relevant.
                j2 -= 1

            if operation == 'equal':
                line_offset = j1 - i1
                self._copy_from_old_parser(line_offset, i2, j2)
            elif operation == 'replace':
                self._parse(until_line=j2)
            elif operation == 'insert':
                self._parse(until_line=j2)
            else:
                assert operation == 'delete'

        # Cleanup (setting the endmarker and used_names).
        self._cleanup()
        if self._added_newline:
            self._parser.module = self._parser._parsed = self._new_module
            self._parser.remove_last_newline()
            self._parsed_until_line -= 1

        self._parser.source = ''.join(lines_new)
        self._old_module = self._new_module

        assert self._new_module.end_pos[0] == line_length

        return self._new_module

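    # A worked example of the opcodes that drive the loop in update() above
    # (illustrative values, not jedi data):
    #
    #     >>> lines_old = ['a\n', 'b\n', 'c\n']
    #     >>> lines_new = ['a\n', 'x\n', 'c\n']
    #     >>> difflib.SequenceMatcher(None, lines_old, lines_new).get_opcodes()
    #     [('equal', 0, 1, 0, 1), ('replace', 1, 2, 1, 2), ('equal', 2, 3, 2, 3)]
    #
    # 'equal' ranges are copied from the old tree with a line offset of
    # j1 - i1, 'replace' and 'insert' ranges are reparsed, and 'delete'
    # requires no action on the new tree.
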
    def _copy_from_old_parser(self, line_offset, until_line_old, until_line_new):
        while until_line_new > self._parsed_until_line:
            parsed_until_line_old = self._parsed_until_line - line_offset
            line_stmt = self._get_old_line_stmt(parsed_until_line_old + 1)
            if line_stmt is None:
                # Parse at least one line. We don't need more, because we just
                # want to get into a state where the old parser has statements
                # again that can be copied (e.g. not lines within parentheses).
                self._parse(self._parsed_until_line + 1)
            else:
                p_children = line_stmt.parent.children
                index = p_children.index(line_stmt)

                # Match all the nodes that are in the wanted range.
                nodes = self._divide_nodes(p_children[index:], until_line_old)
                if nodes:
                    self._copy_count += 1
                    _update_positions(nodes, line_offset)
                    self._insert_nodes(nodes)

                    from_ = nodes[0].get_start_pos_of_prefix()[0]
                    to = _get_last_line(nodes[-1])
                    self._copied_ranges.append((from_, to))

                    debug.dbg('diff actually copy %s to %s', from_, to)
                # We have copied as much as possible (but definitely not too
                # much). Therefore we just parse the rest.
                # We might not reach the end, because there's a statement
                # that is not finished.
                self._parse(until_line_new)
                break

    def _get_old_line_stmt(self, old_line):
        leaf = self._old_module.get_leaf_for_position((old_line, 0), include_prefixes=True)

        if leaf.type == 'newline':
            leaf = leaf.get_next_leaf()
        if leaf.get_start_pos_of_prefix()[0] == old_line:
            node = leaf
            # TODO use leaf.get_definition one day when that one is working
            # well.
            while node.parent.type not in ('file_input', 'suite'):
                node = node.parent
            return node
        # Must be on the same line. Otherwise we need to parse that bit.
        return None

    def _insert_nodes(self, nodes):
        """
        Inserts the given nodes and returns the scope they end up in.
        """
        # Needs to be done before resetting the parsed-until line below.
        before_node = self._get_before_insertion_node()

        last_leaf = nodes[-1].last_leaf()
        is_endmarker = last_leaf.type == self.endmarker_type
        if is_endmarker:
            self._parsed_until_line = last_leaf.start_pos[0]
            if _last_leaf_is_newline(last_leaf):
                self._parsed_until_line -= 1
        else:
            if last_leaf.type == 'newline':
                # Newlines end on the next line, which means that they would
                # cover the next line. That line is not fully parsed at this
                # point.
                self._parsed_until_line = last_leaf.start_pos[0]
            else:
                self._parsed_until_line = last_leaf.end_pos[0]
        debug.dbg('set parsed_until %s', self._parsed_until_line)

        first_leaf = nodes[0].first_leaf()
        first_leaf.prefix = self._prefix + first_leaf.prefix
        self._prefix = ''
        if is_endmarker:
            self._prefix = last_leaf.prefix

            nodes = nodes[:-1]
        if not nodes:
            return self._new_module

        # Now the preparations are done. We are inserting the nodes.
        if before_node is None:  # Everything is empty.
            self._new_children += nodes
            new_parent = self._new_module
        else:
            assert nodes[0].type != 'newline'
            line_indentation = nodes[0].start_pos[1]
            new_parent = before_node.parent
            while True:
                p_children = new_parent.children
                if new_parent.type == 'suite':
                    # A suite starts with a NEWLINE, ...
                    indentation = p_children[1].start_pos[1]
                else:
                    indentation = p_children[0].start_pos[1]

                if line_indentation < indentation:  # Dedent
                    # We might be at the most outer layer: modules. We
                    # don't want to depend on the first statement
                    # having the right indentation.
                    if new_parent.parent is not None:
                        new_parent = search_ancestor(
                            new_parent,
                            ('suite', 'file_input')
                        )
                        continue

                p_children += nodes
                assert new_parent.type in ('suite', 'file_input')
                break

        # Reset the parents.
        for node in nodes:
            node.parent = new_parent
        if new_parent.type == 'suite':
            return new_parent.get_parent_scope()

        return new_parent

    def _get_before_insertion_node(self):
        if not self._new_children:
            return None

        line = self._parsed_until_line + 1
        node = self._new_module.last_leaf()
        while True:
            parent = node.parent
            if parent.type in ('suite', 'file_input'):
                assert node.end_pos[0] <= line
                assert node.end_pos[1] == 0
                return node
            node = parent

    def _divide_node(self, node, until_line):
        if node.type not in ('classdef', 'funcdef'):
            return None

        suite = node.children[-1]
        if suite.type != 'suite':
            return None

        nodes = self._divide_nodes(suite.children, until_line)

        if len(nodes) < 2:
            # A suite with only a newline in it is not valid.
            return None

        new_node = copy.copy(node)
        new_suite = copy.copy(suite)

        # And now set the correct parents.
        for child in nodes:
            child.parent = new_suite
        new_suite.children = nodes

        new_node.children = list(new_node.children)
        new_node.children[-1] = new_suite
        for child in new_node.children:
            child.parent = new_node
        return new_node

    def _copy_divided_nodes(self, nodes):
        parent = nodes[-1].last_leaf().get_parent_scope()
        if parent == nodes[0].get_parent_scope():
            check_nodes = nodes
        else:
            n = parent
            while n is not None:
                if isinstance(n, Flow):
                    parent = n.get_parent_scope()
                n = n.parent
            check_nodes = parent.children

        last_node = check_nodes[-1]

        if last_node.type == 'suite':
            parent = last_node
            check_nodes = parent.children
            last_node = check_nodes[-1]

        drop_node_count = 0
        if last_node.type in ('error_leaf', 'error_node'):
            # Error leaves/nodes don't have a defined start/end. Error
            # nodes might not end with a newline (e.g. if there's an
            # open `(`). Therefore ignore all of them unless they are
            # succeeded by a valid parser state.
            n = last_node
            # In this while loop we try to remove nodes until we find a
            # newline.
            while True:
                drop_node_count += 1
                try:
                    n = check_nodes[drop_node_count]
                except IndexError:
                    break
                if n.last_leaf().type == 'newline':
                    break
        elif _is_flow_node(last_node):
            # If we just copy flows at the end, they might be continued
            # after the copy limit (in the new parser).
            drop_node_count += 1

        if drop_node_count:
            node = self._drop_last_node(nodes[-1], last_node, drop_node_count)
            if node is None:
                nodes = nodes[:-drop_node_count]
            else:
                nodes[-1] = node
        return nodes

    def _drop_last_node(self, base_node, last_node_to_drop, drop_node_count):
        if base_node == last_node_to_drop:
            return None

        last_node = base_node.children[-1]
        child = self._drop_last_node(last_node, last_node_to_drop, drop_node_count)

        base_node = copy.copy(base_node)
        if child is None:
            if base_node.type == 'suite' and len(base_node.children) <= 1 + drop_node_count:
                return None
            if base_node.type in ('classdef', 'funcdef'):
                return None

            base_node.children = base_node.children[:-drop_node_count]
        else:
            base_node.children = list(base_node.children)
            base_node.children[-1] = child
            child.parent = base_node

        for c in base_node.children:
            c.parent = base_node
        return base_node

    def _divide_nodes(self, nodes, until_line):
        """
        Breaks up scopes and returns only the part until the given line.

        Tries to get the parts it can safely get and ignores the rest.
        """
        new_nodes = []
        for i, child in enumerate(nodes):
            # TODO this check might take a bit of time for large files. We
            # might want to change this to do more intelligent guessing or
            # binary search.
            if _get_last_line(child) > until_line:
                node = self._divide_node(child, until_line)
                if node is not None:
                    new_nodes.append(node)
                break
            else:
                new_nodes.append(child)

        if new_nodes:
            return self._copy_divided_nodes(new_nodes)
        return new_nodes

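    # A sketch of what _divide_nodes does for a hypothetical input (not real
    # jedi nodes): given a module's statements and until_line=3 for
    #
    #     1  x = 1
    #     2  def f():
    #     3      a
    #     4      b
    #
    # it keeps `x = 1` and a copy of `f` whose suite only contains the lines
    # up to 3; line 4 is left for reparsing.
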
    def _parse(self, until_line):
        """
        Parses at least until the given line, but might just parse more until
        a valid state is reached.
        """
        while until_line > self._parsed_until_line:
            node = self._try_parse_part(until_line)
            nodes = self._get_children_nodes(node)
            self._insert_nodes(nodes)
            _merge_used_names(
                self._new_module.used_names,
                node.used_names
            )

    def _get_children_nodes(self, node):
        nodes = node.children
        first_element = nodes[0]
        # TODO this looks very strange...
        if first_element.type == 'error_leaf' and \
                first_element.original_type == 'indent':
            assert False, str(nodes)
            #assert nodes[-1].type == 'dedent'
            ## This means that the start and end leaf
            #nodes = nodes[1:-1] + [nodes[-1]]

        return nodes

    def _try_parse_part(self, until_line):
        """
        Sets up a normal parser that uses a specialized tokenizer to only
        parse until a certain position (or a bit longer if the statement
        hasn't ended).
        """
        self._parser_count += 1
        # TODO speed this up; we shouldn't copy the whole list all the time.
        # memoryview?
        lines_after = self._parser_lines_new[self._parsed_until_line:]
        #print('parse_content', self._parsed_until_line, lines_after, until_line)
        tokenizer = self._diff_tokenize(
            lines_after,
            until_line,
            line_offset=self._parsed_until_line
        )
        self._active_parser = ParserWithRecovery(
            self._grammar,
            source='\n',
            start_parsing=False
        )
        return self._active_parser.parse(tokenizer=tokenizer)

    def _cleanup(self):
        """Add used names and an end marker."""
        # Add the used names from the old parser to the new one.
        copied_line_numbers = set()
        for l1, l2 in self._copied_ranges:
            copied_line_numbers.update(range(l1, l2 + 1))

        new_used_names = self._new_module.used_names
        for key, names in self._old_module.used_names.items():
            for name in names:
                if name.start_pos[0] in copied_line_numbers:
                    new_used_names.setdefault(key, []).append(name)

        # Add an endmarker.
        try:
            last_leaf = self._new_module.last_leaf()
            end_pos = list(last_leaf.end_pos)
        except IndexError:
            end_pos = [1, 0]
        lines = splitlines(self._prefix)
        assert len(lines) > 0
        if len(lines) == 1:
            end_pos[1] += len(lines[0])
        else:
            end_pos[0] += len(lines) - 1
            end_pos[1] = len(lines[-1])

        endmarker = EndMarker('', tuple(end_pos), self._prefix)
        endmarker.parent = self._new_module
        self._new_children.append(endmarker)

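    # A worked example of the end-position arithmetic in _cleanup() above,
    # assuming jedi.common.splitlines behaves as it is used elsewhere in
    # this file:
    #
    #     >>> splitlines('# comment\n')
    #     ['# comment', '']
    #
    # Two lines mean the endmarker moves one line down (end_pos[0] += 1) and
    # its column becomes len('') == 0.
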
    def _diff_tokenize(self, lines, until_line, line_offset=0):
        is_first_token = True
        omitted_first_indent = False
        indents = []
        l = iter(lines)
        tokens = generate_tokens(lambda: next(l, ''), use_exact_op_types=True)
        stack = self._active_parser.pgen_parser.stack
        for typ, string, start_pos, prefix in tokens:
            start_pos = start_pos[0] + line_offset, start_pos[1]
            if typ == INDENT:
                indents.append(start_pos[1])
                if is_first_token:
                    omitted_first_indent = True
                    # We want to get rid of indents that are only here because
                    # we only parse part of the file. These indents would only
                    # get parsed as error leaves, which doesn't make any sense.
                    is_first_token = False
                    continue
            is_first_token = False

            if typ == DEDENT:
                indents.pop()
                if omitted_first_indent and not indents:
                    # We are done here; the only thing that can come now is an
                    # endmarker or another dedented code block.
                    typ, string, start_pos, prefix = next(tokens)
                    if '\n' in prefix:
                        prefix = re.sub(r'(?<=\n)[^\n]+$', '', prefix)
                    else:
                        prefix = ''
                    yield TokenInfo(ENDMARKER, '',
                                    (start_pos[0] + line_offset, 0), prefix)
                    break
            elif typ == NEWLINE and start_pos[0] >= until_line:
                yield TokenInfo(typ, string, start_pos, prefix)
                # Check if the parser is actually in a valid suite state.
                if suite_or_file_input_is_valid(self._grammar, stack):
                    start_pos = start_pos[0] + 1, 0
                    while len(indents) > int(omitted_first_indent):
                        indents.pop()
                        yield TokenInfo(DEDENT, '', start_pos, '')

                    yield TokenInfo(ENDMARKER, '', start_pos, '')
                    break
                else:
                    continue

            yield TokenInfo(typ, string, start_pos, prefix)


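# A sketch of the prefix-trimming lookbehind used in _diff_tokenize above
# (illustrative strings only): it keeps everything up to and including the
# last newline and drops the incomplete final line.
assert re.sub(r'(?<=\n)[^\n]+$', '', '# keep\n    dropped') == '# keep\n'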