Files
parso/parso/python/diff.py

679 lines
25 KiB
Python

"""
Basically a contains parser that is faster, because it tries to parse only
parts and if anything changes, it only reparses the changed parts.
It works with a simple diff in the beginning and will try to reuse old parser
fragments.
"""
import re
import difflib
from collections import namedtuple
import logging
from parso.utils import split_lines
from parso.python.parser import Parser
from parso.python.tree import EndMarker
from parso.python.tokenize import PythonToken
from parso.python.token import PythonTokenTypes
LOG = logging.getLogger(__name__)
DEBUG_DIFF_PARSER = False
def _assert_valid_graph(node):
"""
Checks if the parent/children relationship is correct.
"""
try:
children = node.children
except AttributeError:
previous_leaf = node.get_previous_leaf()
# Ignore INDENT is necessary, because indent/dedent tokens don't
# contain value/prefix and are just around, because of the tokenizer.
if node.type == 'error_leaf' and node.token_type in ('INDENT', 'ERROR_DEDENT'):
return
while previous_leaf and previous_leaf.type == 'error_leaf' \
and previous_leaf.token_type in ('INDENT', 'ERROR_DEDENT'):
assert previous_leaf.end_pos <= node.start_pos, \
(previous_leaf, node)
previous_leaf = previous_leaf.get_previous_leaf()
# Calculate the content between two start positions.
if previous_leaf is None:
content = node.prefix
previous_start_pos = 1, 0
else:
assert previous_leaf.end_pos <= node.start_pos, \
(previous_leaf, node)
content = previous_leaf.value + node.prefix
previous_start_pos = previous_leaf.start_pos
if '\n' in content:
splitted = split_lines(content)
line = previous_start_pos[0] + len(splitted) - 1
actual = line, len(splitted[-1])
else:
actual = previous_start_pos[0], previous_start_pos[1] + len(content)
assert node.start_pos == actual, (node.start_pos, actual)
else:
for child in children:
assert child.parent == node, (node, child)
_assert_valid_graph(child)
def _get_debug_error_message(module, old_lines, new_lines):
current_lines = split_lines(module.get_code(), keepends=True)
current_diff = difflib.unified_diff(new_lines, current_lines)
old_new_diff = difflib.unified_diff(old_lines, new_lines)
import parso
return (
"There's an issue with the diff parser. Please "
"report (parso v%s) - Old/New:\n%s\nActual Diff (May be empty):\n%s"
% (parso.__version__, ''.join(old_new_diff), ''.join(current_diff))
)
def _get_last_line(node_or_leaf):
last_leaf = node_or_leaf.get_last_leaf()
if _ends_with_newline(last_leaf):
return last_leaf.start_pos[0]
else:
return last_leaf.end_pos[0]
def _ends_with_newline(leaf, suffix=''):
if leaf.type == 'error_leaf':
typ = leaf.token_type.lower()
else:
typ = leaf.type
return typ == 'newline' or suffix.endswith('\n')
def _flows_finished(pgen_grammar, stack):
"""
if, while, for and try might not be finished, because another part might
still be parsed.
"""
for stack_node in stack:
if stack_node.nonterminal in ('if_stmt', 'while_stmt', 'for_stmt', 'try_stmt'):
return False
return True
def _func_or_class_has_suite(node):
if node.type == 'decorated':
node = node.children[-1]
if node.type in ('async_funcdef', 'async_stmt'):
node = node.children[-1]
return node.type in ('classdef', 'funcdef') and node.children[-1].type == 'suite'
def suite_or_file_input_is_valid(pgen_grammar, stack):
if not _flows_finished(pgen_grammar, stack):
return False
for stack_node in reversed(stack):
if stack_node.nonterminal == 'suite':
# If only newline is in the suite, the suite is not valid, yet.
return len(stack_node.nodes) > 1
# Not reaching a suite means that we're dealing with file_input levels
# where there's no need for a valid statement in it. It can also be empty.
return True
def _is_flow_node(node):
try:
value = node.children[0].value
except AttributeError:
return False
return value in ('if', 'for', 'while', 'try')
class _PositionUpdatingFinished(Exception):
pass
def _update_positions(nodes, line_offset, last_leaf):
for node in nodes:
try:
children = node.children
except AttributeError:
# Is a leaf
node.line += line_offset
if node is last_leaf:
raise _PositionUpdatingFinished
else:
_update_positions(children, line_offset, last_leaf)
class DiffParser(object):
"""
An advanced form of parsing a file faster. Unfortunately comes with huge
side effects. It changes the given module.
"""
def __init__(self, pgen_grammar, tokenizer, module):
self._pgen_grammar = pgen_grammar
self._tokenizer = tokenizer
self._module = module
def _reset(self):
self._copy_count = 0
self._parser_count = 0
self._nodes_tree = _NodesTree(self._module)
def update(self, old_lines, new_lines):
'''
The algorithm works as follows:
Equal:
- Assure that the start is a newline, otherwise parse until we get
one.
- Copy from parsed_until_line + 1 to max(i2 + 1)
- Make sure that the indentation is correct (e.g. add DEDENT)
- Add old and change positions
Insert:
- Parse from parsed_until_line + 1 to min(j2 + 1), hopefully not
much more.
Returns the new module node.
'''
LOG.debug('diff parser start')
# Reset the used names cache so they get regenerated.
self._module._used_names = None
self._parser_lines_new = new_lines
self._reset()
line_length = len(new_lines)
sm = difflib.SequenceMatcher(None, old_lines, self._parser_lines_new)
opcodes = sm.get_opcodes()
LOG.debug('line_lengths old: %s; new: %s' % (len(old_lines), line_length))
for operation, i1, i2, j1, j2 in opcodes:
LOG.debug('-> code[%s] old[%s:%s] new[%s:%s]',
operation, i1 + 1, i2, j1 + 1, j2)
if j2 == line_length and new_lines[-1] == '':
# The empty part after the last newline is not relevant.
j2 -= 1
if operation == 'equal':
line_offset = j1 - i1
self._copy_from_old_parser(line_offset, i2, j2)
elif operation == 'replace':
self._parse(until_line=j2)
elif operation == 'insert':
self._parse(until_line=j2)
else:
assert operation == 'delete'
# With this action all change will finally be applied and we have a
# changed module.
self._nodes_tree.close()
last_pos = self._module.end_pos[0]
if last_pos != line_length:
raise Exception(
('(%s != %s) ' % (last_pos, line_length))
+ _get_debug_error_message(self._module, old_lines, new_lines)
)
if DEBUG_DIFF_PARSER:
# If there is reasonable suspicion that the diff parser is not
# behaving well, this should be enabled.
try:
assert self._module.get_code() == ''.join(new_lines)
_assert_valid_graph(self._module)
except AssertionError:
print(_get_debug_error_message(self._module, old_lines, new_lines))
raise
LOG.debug('diff parser end')
return self._module
def _enabled_debugging(self, old_lines, lines_new):
if self._module.get_code() != ''.join(lines_new):
LOG.warning('parser issue:\n%s\n%s', ''.join(old_lines), ''.join(lines_new))
def _copy_from_old_parser(self, line_offset, until_line_old, until_line_new):
copied_nodes = [None]
last_until_line = -1
while until_line_new > self._nodes_tree.parsed_until_line:
parsed_until_line_old = self._nodes_tree.parsed_until_line - line_offset
line_stmt = self._get_old_line_stmt(parsed_until_line_old + 1)
if line_stmt is None:
# Parse 1 line at least. We don't need more, because we just
# want to get into a state where the old parser has statements
# again that can be copied (e.g. not lines within parentheses).
self._parse(self._nodes_tree.parsed_until_line + 1)
elif not copied_nodes:
# We have copied as much as possible (but definitely not too
# much). Therefore we just parse the rest.
# We might not reach the end, because there's a statement
# that is not finished.
self._parse(until_line_new)
else:
p_children = line_stmt.parent.children
index = p_children.index(line_stmt)
from_ = self._nodes_tree.parsed_until_line + 1
copied_nodes = self._nodes_tree.copy_nodes(
p_children[index:],
until_line_old,
line_offset
)
# Match all the nodes that are in the wanted range.
if copied_nodes:
self._copy_count += 1
to = self._nodes_tree.parsed_until_line
LOG.debug('copy old[%s:%s] new[%s:%s]',
copied_nodes[0].start_pos[0],
copied_nodes[-1].end_pos[0] - 1, from_, to)
# Since there are potential bugs that might loop here endlessly, we
# just stop here.
assert last_until_line != self._nodes_tree.parsed_until_line \
or not copied_nodes, last_until_line
last_until_line = self._nodes_tree.parsed_until_line
def _get_old_line_stmt(self, old_line):
leaf = self._module.get_leaf_for_position((old_line, 0), include_prefixes=True)
if _ends_with_newline(leaf):
leaf = leaf.get_next_leaf()
if leaf.get_start_pos_of_prefix()[0] == old_line:
node = leaf
while node.parent.type not in ('file_input', 'suite'):
node = node.parent
# Make sure that if only the `else:` line of an if statement is
# copied that not the whole thing is going to be copied.
if node.start_pos[0] >= old_line:
return node
# Must be on the same line. Otherwise we need to parse that bit.
return None
def _parse(self, until_line):
"""
Parses at least until the given line, but might just parse more until a
valid state is reached.
"""
last_until_line = 0
while until_line > self._nodes_tree.parsed_until_line:
node = self._try_parse_part(until_line)
nodes = node.children
self._nodes_tree.add_parsed_nodes(nodes)
LOG.debug(
'parse_part from %s to %s (to %s in part parser)',
nodes[0].get_start_pos_of_prefix()[0],
self._nodes_tree.parsed_until_line,
node.end_pos[0] - 1
)
# Since the tokenizer sometimes has bugs, we cannot be sure that
# this loop terminates. Therefore assert that there's always a
# change.
assert last_until_line != self._nodes_tree.parsed_until_line, last_until_line
last_until_line = self._nodes_tree.parsed_until_line
def _try_parse_part(self, until_line):
"""
Sets up a normal parser that uses a spezialized tokenizer to only parse
until a certain position (or a bit longer if the statement hasn't
ended.
"""
self._parser_count += 1
# TODO speed up, shouldn't copy the whole list all the time.
# memoryview?
parsed_until_line = self._nodes_tree.parsed_until_line
lines_after = self._parser_lines_new[parsed_until_line:]
tokens = self._diff_tokenize(
lines_after,
until_line,
line_offset=parsed_until_line
)
self._active_parser = Parser(
self._pgen_grammar,
error_recovery=True
)
return self._active_parser.parse(tokens=tokens)
def _diff_tokenize(self, lines, until_line, line_offset=0):
is_first_token = True
omitted_first_indent = False
indents = []
tokens = self._tokenizer(lines, (1, 0))
stack = self._active_parser.stack
for typ, string, start_pos, prefix in tokens:
start_pos = start_pos[0] + line_offset, start_pos[1]
if typ == PythonTokenTypes.INDENT:
indents.append(start_pos[1])
if is_first_token:
omitted_first_indent = True
# We want to get rid of indents that are only here because
# we only parse part of the file. These indents would only
# get parsed as error leafs, which doesn't make any sense.
is_first_token = False
continue
is_first_token = False
# In case of omitted_first_indent, it might not be dedented fully.
# However this is a sign for us that a dedent happened.
if typ == PythonTokenTypes.DEDENT \
or typ == PythonTokenTypes.ERROR_DEDENT \
and omitted_first_indent and len(indents) == 1:
indents.pop()
if omitted_first_indent and not indents:
# We are done here, only thing that can come now is an
# endmarker or another dedented code block.
typ, string, start_pos, prefix = next(tokens)
if '\n' in prefix:
prefix = re.sub(r'(<=\n)[^\n]+$', '', prefix)
else:
prefix = ''
yield PythonToken(
PythonTokenTypes.ENDMARKER, '',
(start_pos[0] + line_offset, 0),
prefix
)
break
elif typ == PythonTokenTypes.NEWLINE and start_pos[0] >= until_line:
yield PythonToken(typ, string, start_pos, prefix)
# Check if the parser is actually in a valid suite state.
if suite_or_file_input_is_valid(self._pgen_grammar, stack):
start_pos = start_pos[0] + 1, 0
while len(indents) > int(omitted_first_indent):
indents.pop()
yield PythonToken(PythonTokenTypes.DEDENT, '', start_pos, '')
yield PythonToken(PythonTokenTypes.ENDMARKER, '', start_pos, '')
break
else:
continue
yield PythonToken(typ, string, start_pos, prefix)
class _NodesTreeNode(object):
_ChildrenGroup = namedtuple('_ChildrenGroup', 'prefix children line_offset last_line_offset_leaf')
def __init__(self, tree_node, parent=None):
self.tree_node = tree_node
self._children_groups = []
self.parent = parent
self._node_children = []
def finish(self):
children = []
for prefix, children_part, line_offset, last_line_offset_leaf in self._children_groups:
first_leaf = children_part[0].get_first_leaf()
first_leaf.prefix = prefix + first_leaf.prefix
if line_offset != 0:
try:
_update_positions(
children_part, line_offset, last_line_offset_leaf)
except _PositionUpdatingFinished:
pass
children += children_part
self.tree_node.children = children
# Reset the parents
for node in children:
node.parent = self.tree_node
for node_child in self._node_children:
node_child.finish()
def add_child_node(self, child_node):
self._node_children.append(child_node)
def add_tree_nodes(self, prefix, children, line_offset=0, last_line_offset_leaf=None):
if last_line_offset_leaf is None:
last_line_offset_leaf = children[-1].get_last_leaf()
group = self._ChildrenGroup(prefix, children, line_offset, last_line_offset_leaf)
self._children_groups.append(group)
def get_last_line(self, suffix):
line = 0
if self._children_groups:
children_group = self._children_groups[-1]
last_leaf = children_group.last_line_offset_leaf
line = last_leaf.end_pos[0] + children_group.line_offset
# Newlines end on the next line, which means that they would cover
# the next line. That line is not fully parsed at this point.
if _ends_with_newline(last_leaf, suffix):
line -= 1
line += suffix.count('\n')
if suffix and not suffix.endswith('\n'):
# This is the end of a file (that doesn't end with a newline).
line += 1
if self._node_children:
return max(line, self._node_children[-1].get_last_line(suffix))
return line
class _NodesTree(object):
endmarker_type = 'endmarker'
def __init__(self, module):
self._base_node = _NodesTreeNode(module)
self._working_stack = [self._base_node]
self._module = module
self._prefix_remainder = ''
self.prefix = ''
@property
def parsed_until_line(self):
return self._working_stack[-1].get_last_line(self.prefix)
def _get_insertion_node(self, indentation_node):
indentation = indentation_node.start_pos[1]
# find insertion node
while True:
node = self._working_stack[-1]
tree_node = node.tree_node
if tree_node.type == 'suite':
# A suite starts with NEWLINE, ...
node_indentation = tree_node.children[1].start_pos[1]
if indentation >= node_indentation: # Not a Dedent
# We might be at the most outer layer: modules. We
# don't want to depend on the first statement
# having the right indentation.
return node
elif tree_node.type == 'file_input':
return node
self._working_stack.pop()
def add_parsed_nodes(self, tree_nodes):
old_prefix = self.prefix
tree_nodes = self._remove_endmarker(tree_nodes)
if not tree_nodes:
self.prefix = old_prefix + self.prefix
return
assert tree_nodes[0].type != 'newline'
node = self._get_insertion_node(tree_nodes[0])
assert node.tree_node.type in ('suite', 'file_input')
node.add_tree_nodes(old_prefix, tree_nodes)
# tos = Top of stack
self._update_tos(tree_nodes[-1])
def _update_tos(self, tree_node):
if tree_node.type in ('suite', 'file_input'):
new_tos = _NodesTreeNode(tree_node)
new_tos.add_tree_nodes('', list(tree_node.children))
self._working_stack[-1].add_child_node(new_tos)
self._working_stack.append(new_tos)
self._update_tos(tree_node.children[-1])
elif _func_or_class_has_suite(tree_node):
self._update_tos(tree_node.children[-1])
def _remove_endmarker(self, tree_nodes):
"""
Helps cleaning up the tree nodes that get inserted.
"""
last_leaf = tree_nodes[-1].get_last_leaf()
is_endmarker = last_leaf.type == self.endmarker_type
self._prefix_remainder = ''
if is_endmarker:
try:
separation = last_leaf.prefix.rindex('\n') + 1
except ValueError:
pass
else:
# Remove the whitespace part of the prefix after a newline.
# That is not relevant if parentheses were opened. Always parse
# until the end of a line.
last_leaf.prefix, self._prefix_remainder = \
last_leaf.prefix[:separation], last_leaf.prefix[separation:]
self.prefix = ''
if is_endmarker:
self.prefix = last_leaf.prefix
tree_nodes = tree_nodes[:-1]
return tree_nodes
def copy_nodes(self, tree_nodes, until_line, line_offset):
"""
Copies tree nodes from the old parser tree.
Returns the number of tree nodes that were copied.
"""
self._get_insertion_node(tree_nodes[0])
new_nodes, self._working_stack = self._copy_nodes(
list(self._working_stack),
tree_nodes,
until_line,
line_offset,
self.prefix,
)
return new_nodes
def _copy_nodes(self, working_stack, nodes, until_line, line_offset, prefix=''):
new_nodes = []
new_prefix = ''
for node in nodes:
if node.start_pos[0] > until_line:
break
if node.type == 'endmarker':
# We basically removed the endmarker, but we are not allowed to
# remove the newline at the end of the line, otherwise it's
# going to be missing.
try:
new_prefix = node.prefix[:node.prefix.rindex('\n') + 1]
except ValueError:
pass
# Endmarkers just distort all the checks below. Remove them.
break
# TODO this check might take a bit of time for large files. We
# might want to change this to do more intelligent guessing or
# binary search.
if _get_last_line(node) > until_line:
# We can split up functions and classes later.
if _func_or_class_has_suite(node):
new_nodes.append(node)
break
new_nodes.append(node)
if not new_nodes:
return [], working_stack
tos = working_stack[-1]
last_node = new_nodes[-1]
had_valid_suite_last = False
if _func_or_class_has_suite(last_node):
suite = last_node
while suite.type != 'suite':
suite = suite.children[-1]
suite_tos = _NodesTreeNode(suite)
# Don't need to pass line_offset here, it's already done by the
# parent.
suite_nodes, new_working_stack = self._copy_nodes(
working_stack + [suite_tos], suite.children, until_line, line_offset
)
if len(suite_nodes) < 2:
# A suite only with newline is not valid.
new_nodes.pop()
else:
assert new_nodes
tos.add_child_node(suite_tos)
working_stack = new_working_stack
had_valid_suite_last = True
elif (last_node.type in ('error_leaf', 'error_node') or
_is_flow_node(new_nodes[-1])):
# Error leafs/nodes don't have a defined start/end. Error
# nodes might not end with a newline (e.g. if there's an
# open `(`). Therefore ignore all of them unless they are
# succeeded with valid parser state.
# If we copy flows at the end, they might be continued
# after the copy limit (in the new parser).
# In this while loop we try to remove until we find a newline.
new_nodes.pop()
while new_nodes:
last_node = new_nodes[-1]
if last_node.get_last_leaf().type == 'newline':
break
new_nodes.pop()
if new_nodes:
if had_valid_suite_last:
last = new_nodes[-1]
if last.type == 'decorated':
last = last.children[-1]
last_line_offset_leaf = last.children[-2].get_last_leaf()
assert last_line_offset_leaf == ':'
else:
last_line_offset_leaf = new_nodes[-1].get_last_leaf()
tos.add_tree_nodes(prefix, new_nodes, line_offset, last_line_offset_leaf)
self.prefix = new_prefix
self._prefix_remainder = ''
return new_nodes, working_stack
def close(self):
self._base_node.finish()
# Add an endmarker.
try:
last_leaf = self._module.get_last_leaf()
end_pos = list(last_leaf.end_pos)
except IndexError:
end_pos = [1, 0]
lines = split_lines(self.prefix)
assert len(lines) > 0
if len(lines) == 1:
end_pos[1] += len(lines[0])
else:
end_pos[0] += len(lines) - 1
end_pos[1] = len(lines[-1])
endmarker = EndMarker('', tuple(end_pos), self.prefix + self._prefix_remainder)
endmarker.parent = self._module
self._module.children.append(endmarker)