""" Basically a contains parser that is faster, because it tries to parse only parts and if anything changes, it only reparses the changed parts. It works with a simple diff in the beginning and will try to reuse old parser fragments. """ import re import difflib from collections import namedtuple import logging from parso.utils import splitlines from parso.python.parser import Parser, remove_last_newline from parso.python.tree import EndMarker from parso.tokenize import (generate_tokens, NEWLINE, TokenInfo, ENDMARKER, INDENT, DEDENT) def _get_last_line(node_or_leaf): last_leaf = node_or_leaf.get_last_leaf() if _ends_with_newline(last_leaf): return last_leaf.start_pos[0] else: return last_leaf.end_pos[0] def _ends_with_newline(leaf, suffix=''): if leaf.type == 'error_leaf': typ = leaf.original_type else: typ = leaf.type return typ == 'newline' or suffix.endswith('\n') def _flows_finished(pgen_grammar, stack): """ if, while, for and try might not be finished, because another part might still be parsed. """ for dfa, newstate, (symbol_number, nodes) in stack: if pgen_grammar.number2symbol[symbol_number] in ('if_stmt', 'while_stmt', 'for_stmt', 'try_stmt'): return False return True def suite_or_file_input_is_valid(pgen_grammar, stack): if not _flows_finished(pgen_grammar, stack): return False for dfa, newstate, (symbol_number, nodes) in reversed(stack): if pgen_grammar.number2symbol[symbol_number] == 'suite': # If only newline is in the suite, the suite is not valid, yet. return len(nodes) > 1 # Not reaching a suite means that we're dealing with file_input levels # where there's no need for a valid statement in it. It can also be empty. return True def _is_flow_node(node): try: value = node.children[0].value except AttributeError: return False return value in ('if', 'for', 'while', 'try') class _PositionUpdatingFinished(Exception): pass def _update_positions(nodes, line_offset, last_leaf): for node in nodes: try: children = node.children except AttributeError: # Is a leaf node.line += line_offset if node is last_leaf: raise _PositionUpdatingFinished else: _update_positions(children, line_offset, last_leaf) class DiffParser(object): """ An advanced form of parsing a file faster. Unfortunately comes with huge side effects. It changes the given module. """ def __init__(self, pgen_grammar, module): self._pgen_grammar = pgen_grammar self._module = module def _reset(self): self._copy_count = 0 self._parser_count = 0 self._nodes_stack = _NodesStack(self._module) def update(self, old_lines, new_lines): ''' The algorithm works as follows: Equal: - Assure that the start is a newline, otherwise parse until we get one. - Copy from parsed_until_line + 1 to max(i2 + 1) - Make sure that the indentation is correct (e.g. add DEDENT) - Add old and change positions Insert: - Parse from parsed_until_line + 1 to min(j2 + 1), hopefully not much more. Returns the new module node. ''' logging.debug('diff parser start') # Reset the used names cache so they get regenerated. self._module._used_names = None self._parser_lines_new = new_lines self._added_newline = False if new_lines[-1] != '': # The Python grammar needs a newline at the end of a file, but for # everything else we keep working with new_lines here. 
            self._parser_lines_new = list(new_lines)
            self._parser_lines_new[-1] += '\n'
            self._parser_lines_new.append('')
            self._added_newline = True

        self._reset()

        line_length = len(new_lines)
        sm = difflib.SequenceMatcher(None, old_lines, self._parser_lines_new)
        opcodes = sm.get_opcodes()
        logging.debug('diff parser calculated')
        logging.debug('diff: line_lengths old: %s, new: %s'
                      % (len(old_lines), line_length))

        for operation, i1, i2, j1, j2 in opcodes:
            logging.debug('diff %s old[%s:%s] new[%s:%s]',
                          operation, i1 + 1, i2, j1 + 1, j2)

            if j2 == line_length + int(self._added_newline):
                # The empty part after the last newline is not relevant.
                j2 -= 1

            if operation == 'equal':
                line_offset = j1 - i1
                self._copy_from_old_parser(line_offset, i2, j2)
            elif operation == 'replace':
                self._parse(until_line=j2)
            elif operation == 'insert':
                self._parse(until_line=j2)
            else:
                assert operation == 'delete'

        # With this action all changes will finally be applied and we have a
        # changed module.
        self._nodes_stack.close()

        if self._added_newline:
            remove_last_newline(self._module)

        last_pos = self._module.end_pos[0]
        if last_pos != line_length:
            current_lines = splitlines(self._module.get_code(), keepends=True)
            diff = difflib.unified_diff(current_lines, new_lines)
            raise Exception(
                "There's an issue (%s != %s) with the diff parser. Please report:\n%s"
                % (last_pos, line_length, ''.join(diff))
            )

        logging.debug('diff parser end')
        return self._module

    def _enabled_debugging(self, old_lines, lines_new):
        if self._module.get_code() != ''.join(lines_new):
            logging.warning('parser issue:\n%s\n%s', ''.join(old_lines),
                            ''.join(lines_new))

    def _copy_from_old_parser(self, line_offset, until_line_old, until_line_new):
        copied_nodes = [None]

        last_until_line = -1
        while until_line_new > self._nodes_stack.parsed_until_line:
            parsed_until_line_old = self._nodes_stack.parsed_until_line - line_offset
            line_stmt = self._get_old_line_stmt(parsed_until_line_old + 1)
            if line_stmt is None:
                # Parse 1 line at least. We don't need more, because we just
                # want to get into a state where the old parser has statements
                # again that can be copied (e.g. not lines within parentheses).
                self._parse(self._nodes_stack.parsed_until_line + 1)
            elif not copied_nodes:
                # We have copied as much as possible (but definitely not too
                # much). Therefore we just parse the rest.
                # We might not reach the end, because there's a statement
                # that is not finished.
                self._parse(until_line_new)
            else:
                p_children = line_stmt.parent.children
                index = p_children.index(line_stmt)

                copied_nodes = self._nodes_stack.copy_nodes(
                    p_children[index:],
                    until_line_old,
                    line_offset
                )
                # Match all the nodes that are in the wanted range.
                if copied_nodes:
                    self._copy_count += 1

                    from_ = copied_nodes[0].get_start_pos_of_prefix()[0] + line_offset
                    to = self._nodes_stack.parsed_until_line

                    logging.debug('diff actually copy %s to %s', from_, to)
            # Since there are potential bugs that might loop here endlessly, we
            # just stop here.
            assert last_until_line != self._nodes_stack.parsed_until_line \
                or not copied_nodes, last_until_line
            last_until_line = self._nodes_stack.parsed_until_line

    def _get_old_line_stmt(self, old_line):
        leaf = self._module.get_leaf_for_position((old_line, 0),
                                                  include_prefixes=True)

        if _ends_with_newline(leaf):
            leaf = leaf.get_next_leaf()
        if leaf.get_start_pos_of_prefix()[0] == old_line:
            node = leaf
            while node.parent.type not in ('file_input', 'suite'):
                node = node.parent
            return node
        # Must be on the same line. Otherwise we need to parse that bit.
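        # Illustrative example: for `x = (1 +\n2)`, line 2 begins inside the
        # parenthesized expression, so no statement starts on that line and
        # None is returned; the caller then falls back to reparsing instead
        # of copying.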
        return None

    def _get_before_insertion_node(self):
        if self._nodes_stack.is_empty():
            return None

        line = self._nodes_stack.parsed_until_line + 1
        node = self._module.get_last_leaf()
        while True:
            parent = node.parent
            if parent.type in ('suite', 'file_input'):
                assert node.end_pos[0] <= line
                assert node.end_pos[1] == 0 or '\n' in self._nodes_stack.prefix
                return node
            node = parent

    def _parse(self, until_line):
        """
        Parses at least until the given line, but might just parse more until a
        valid state is reached.
        """
        last_until_line = 0
        while until_line > self._nodes_stack.parsed_until_line:
            node = self._try_parse_part(until_line)
            nodes = self._get_children_nodes(node)
            #self._insert_nodes(nodes)

            self._nodes_stack.add_parsed_nodes(nodes)
            logging.debug(
                'parse part %s to %s (to %s in parser)',
                nodes[0].get_start_pos_of_prefix()[0],
                self._nodes_stack.parsed_until_line,
                node.end_pos[0] - 1
            )
            # Since the tokenizer sometimes has bugs, we cannot be sure that
            # this loop terminates. Therefore assert that there's always a
            # change.
            assert last_until_line != self._nodes_stack.parsed_until_line, last_until_line
            last_until_line = self._nodes_stack.parsed_until_line

    def _get_children_nodes(self, node):
        nodes = node.children
        first_element = nodes[0]
        # TODO this looks very strange...
        if first_element.type == 'error_leaf' and \
                first_element.original_type == 'indent':
            assert False, str(nodes)

        return nodes

    def _try_parse_part(self, until_line):
        """
        Sets up a normal parser that uses a specialized tokenizer to only parse
        until a certain position (or a bit longer if the statement hasn't
        ended).
        """
        self._parser_count += 1
        # TODO speed up, shouldn't copy the whole list all the time.
        # memoryview?
        parsed_until_line = self._nodes_stack.parsed_until_line
        lines_after = self._parser_lines_new[parsed_until_line:]
        #print('parse_content', parsed_until_line, lines_after, until_line)
        tokens = self._diff_tokenize(
            lines_after,
            until_line,
            line_offset=parsed_until_line
        )
        self._active_parser = Parser(
            self._pgen_grammar,
            error_recovery=True
        )
        return self._active_parser.parse(tokens=tokens)

    def _diff_tokenize(self, lines, until_line, line_offset=0):
        is_first_token = True
        omitted_first_indent = False
        indents = []
        tokens = generate_tokens(lines, use_exact_op_types=True)
        stack = self._active_parser.pgen_parser.stack
        for typ, string, start_pos, prefix in tokens:
            start_pos = start_pos[0] + line_offset, start_pos[1]
            if typ == INDENT:
                indents.append(start_pos[1])
                if is_first_token:
                    omitted_first_indent = True
                    # We want to get rid of indents that are only here because
                    # we only parse part of the file. These indents would only
                    # get parsed as error leafs, which doesn't make any sense.
                    is_first_token = False
                    continue

            is_first_token = False

            if typ == DEDENT:
                indents.pop()
                if omitted_first_indent and not indents:
                    # We are done here, the only thing that can come now is an
                    # endmarker or another dedented code block.
                    typ, string, start_pos, prefix = next(tokens)
                    if '\n' in prefix:
                        # Drop the incomplete last line of the prefix; we
                        # always parse until the end of a line.
                        prefix = re.sub(r'(?<=\n)[^\n]+$', '', prefix)
                    else:
                        prefix = ''
                    yield TokenInfo(ENDMARKER, '', (start_pos[0] + line_offset, 0), prefix)
                    break
            elif typ == NEWLINE and start_pos[0] >= until_line:
                yield TokenInfo(typ, string, start_pos, prefix)
                # Check if the parser is actually in a valid suite state.
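                # "Valid" means no if/while/for/try is still open on the pgen
                # stack and the innermost suite already contains a statement
                # (see suite_or_file_input_is_valid above). Stopping earlier
                # would cut a compound statement in half, e.g. ending right
                # after `try:` before its `except` part has been parsed.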
                if suite_or_file_input_is_valid(self._pgen_grammar, stack):
                    start_pos = start_pos[0] + 1, 0
                    while len(indents) > int(omitted_first_indent):
                        indents.pop()
                        yield TokenInfo(DEDENT, '', start_pos, '')

                    yield TokenInfo(ENDMARKER, '', start_pos, '')
                    break
                else:
                    continue

            yield TokenInfo(typ, string, start_pos, prefix)


class _NodesStackNode(object):
    ChildrenGroup = namedtuple('ChildrenGroup',
                               'children line_offset last_line_offset_leaf')

    def __init__(self, tree_node, parent=None):
        self.tree_node = tree_node
        self.children_groups = []
        self.parent = parent

    def close(self):
        children = []
        for children_part, line_offset, last_line_offset_leaf in self.children_groups:
            if line_offset != 0:
                try:
                    _update_positions(
                        children_part, line_offset, last_line_offset_leaf)
                except _PositionUpdatingFinished:
                    pass
            children += children_part
        self.tree_node.children = children
        # Reset the parents
        for node in children:
            node.parent = self.tree_node

    def add(self, children, line_offset=0, last_line_offset_leaf=None):
        group = self.ChildrenGroup(children, line_offset, last_line_offset_leaf)
        self.children_groups.append(group)

    def get_last_line(self, suffix):
        line = 0
        if self.children_groups:
            children_group = self.children_groups[-1]
            last_leaf = children_group.children[-1].get_last_leaf()
            line = last_leaf.end_pos[0]

            # Calculate the line offsets
            offset = children_group.line_offset
            if offset:
                # In case the line_offset is not applied to this specific leaf,
                # just ignore it.
                if last_leaf.line <= children_group.last_line_offset_leaf.line:
                    line += children_group.line_offset

            # Newlines end on the next line, which means that they would cover
            # the next line. That line is not fully parsed at this point.
            if _ends_with_newline(last_leaf, suffix):
                line -= 1
        line += suffix.count('\n')
        return line


class _NodesStack(object):
    endmarker_type = 'endmarker'

    def __init__(self, module):
        # Top of stack
        self._tos = self._base_node = _NodesStackNode(module)
        self._module = module
        self._last_prefix = ''
        self.prefix = ''

    def is_empty(self):
        return not self._base_node.children

    @property
    def parsed_until_line(self):
        return self._tos.get_last_line(self.prefix)

    def _get_insertion_node(self, indentation_node):
        indentation = indentation_node.start_pos[1]

        # find insertion node
        node = self._tos
        while True:
            tree_node = node.tree_node
            if tree_node.type == 'suite':
                # A suite starts with NEWLINE, ...
                node_indentation = tree_node.children[1].start_pos[1]

                if indentation >= node_indentation:  # Not a Dedent
                    # We might be at the most outer layer: modules. We
                    # don't want to depend on the first statement
                    # having the right indentation.
                    return node

            elif tree_node.type == 'file_input':
                return node

            node = self._close_tos()

    def _close_tos(self):
        self._tos.close()
        self._tos = self._tos.parent
        return self._tos

    def add_parsed_nodes(self, tree_nodes):
        tree_nodes = self._remove_endmarker(tree_nodes)
        if not tree_nodes:
            return

        assert tree_nodes[0].type != 'newline'

        node = self._get_insertion_node(tree_nodes[0])
        assert node.tree_node.type in ('suite', 'file_input')
        node.add(tree_nodes)
        self._update_tos(tree_nodes[-1])

    def _remove_endmarker(self, tree_nodes):
        """
        Helps cleaning up the tree nodes that get inserted.
        """
        last_leaf = tree_nodes[-1].get_last_leaf()
        is_endmarker = last_leaf.type == self.endmarker_type
        self._last_prefix = ''
        if is_endmarker:
            try:
                separation = last_leaf.prefix.rindex('\n')
            except ValueError:
                pass
            else:
                # Remove the whitespace part of the prefix after a newline.
                # That is not relevant if parentheses were opened. Always parse
                # until the end of a line.
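                # Illustrative example: an endmarker prefix of '\n    ' is
                # split below into '\n' (kept on the leaf) and '    '
                # (stashed in self._last_prefix and re-attached to the new
                # endmarker in close()).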
                last_leaf.prefix, self._last_prefix = \
                    last_leaf.prefix[:separation + 1], \
                    last_leaf.prefix[separation + 1:]

        first_leaf = tree_nodes[0].get_first_leaf()
        first_leaf.prefix = self.prefix + first_leaf.prefix
        self.prefix = ''

        if is_endmarker:
            self.prefix = last_leaf.prefix

            tree_nodes = tree_nodes[:-1]
        return tree_nodes

    def copy_nodes(self, tree_nodes, until_line, line_offset):
        """
        Copies tree nodes from the old parser tree.

        Returns the list of nodes that were actually copied.
        """
        tos = self._get_insertion_node(tree_nodes[0])

        new_nodes, self._tos = self._copy_nodes(tos, tree_nodes, until_line,
                                                line_offset)
        return new_nodes

    def _copy_nodes(self, tos, nodes, until_line, line_offset):
        new_nodes = []

        new_tos = tos
        for node in nodes:
            if node.type == 'endmarker':
                # Endmarkers just distort all the checks below. Remove them.
                break

            if node.start_pos[0] > until_line:
                break
            # TODO this check might take a bit of time for large files. We
            # might want to change this to do more intelligent guessing or
            # binary search.
            if _get_last_line(node) > until_line:
                # We can split up functions and classes later.
                if node.type in ('classdef', 'funcdef') \
                        and node.children[-1].type == 'suite':
                    new_nodes.append(node)
                break

            new_nodes.append(node)

        if not new_nodes:
            return [], tos

        last_node = new_nodes[-1]
        line_offset_index = -1
        if last_node.type in ('classdef', 'funcdef'):
            suite = last_node.children[-1]
            if suite.type == 'suite':
                suite_tos = _NodesStackNode(suite)
                # Don't need to pass line_offset here, it's already done by the
                # parent.
                suite_nodes, recursive_tos = self._copy_nodes(
                    suite_tos, suite.children, until_line, line_offset)
                if len(suite_nodes) < 2:
                    # A suite only with newline is not valid.
                    new_nodes.pop()
                else:
                    suite_tos.parent = tos
                    new_tos = recursive_tos
                    line_offset_index = -2

        elif (new_nodes[-1].type in ('error_leaf', 'error_node') or
              _is_flow_node(new_nodes[-1])):
            # Error leafs/nodes don't have a defined start/end. Error
            # nodes might not end with a newline (e.g. if there's an
            # open `(`). Therefore ignore all of them unless they are
            # succeeded with valid parser state.
            # If we copy flows at the end, they might be continued
            # after the copy limit (in the new parser).

            # In this while loop we try to remove until we find a newline.
            new_nodes.pop()
            while new_nodes:
                last_node = new_nodes[-1]
                if last_node.get_last_leaf().type == 'newline':
                    break
                new_nodes.pop()

        if new_nodes:
            try:
                last_line_offset_leaf = new_nodes[line_offset_index].get_last_leaf()
            except IndexError:
                line_offset = 0
                # In this case we don't have to calculate an offset, because
                # there are no children to be managed.
                last_line_offset_leaf = None
            tos.add(new_nodes, line_offset, last_line_offset_leaf)
        return new_nodes, new_tos

    def _update_tos(self, tree_node):
        if tree_node.type in ('suite', 'file_input'):
            self._tos = _NodesStackNode(tree_node, self._tos)
            self._tos.add(list(tree_node.children))
            self._update_tos(tree_node.children[-1])
        elif tree_node.type in ('classdef', 'funcdef'):
            self._update_tos(tree_node.children[-1])

    def close(self):
        while self._tos is not None:
            self._close_tos()

        # Add an endmarker.
        try:
            last_leaf = self._module.get_last_leaf()
            end_pos = list(last_leaf.end_pos)
        except IndexError:
            end_pos = [1, 0]
        lines = splitlines(self.prefix)
        assert len(lines) > 0
        if len(lines) == 1:
            end_pos[1] += len(lines[0])
        else:
            end_pos[0] += len(lines) - 1
            end_pos[1] = len(lines[-1])

        endmarker = EndMarker('', tuple(end_pos), self.prefix + self._last_prefix)
        endmarker.parent = self._module
        self._module.children.append(endmarker)
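

# Usage sketch (illustrative only, not part of the module): roughly how a
# parso grammar object would drive the diff parser. The `_pgen_grammar`
# attribute access below is an assumption about the surrounding API.
#
#     import parso
#     from parso.utils import splitlines
#
#     grammar = parso.load_grammar()
#     old_code = 'def f():\n    return 1\n'
#     new_code = 'def f():\n    return 2\n'
#     module = grammar.parse(old_code)
#     diff_parser = DiffParser(grammar._pgen_grammar, module)
#     module = diff_parser.update(splitlines(old_code, keepends=True),
#                                 splitlines(new_code, keepends=True))
#
# Only the changed `return` line is reparsed; the `def f():` line and its
# leaves are reused from the old tree (see self._copy_count).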