1
0
forked from VimPlug/jedi
Files
jedi-fork/jedi/parser/diff.py
2017-01-12 08:46:58 +01:00

595 lines
22 KiB
Python

"""
Basically a parser that is faster, because it tries to parse only parts of
the source: if anything changes, it only reparses the changed parts.
It starts with a simple line diff and then tries to reuse fragments of the old
parser tree.
"""
import copy
import re
import difflib
from jedi._compatibility import use_metaclass
from jedi import settings
from jedi.common import splitlines
from jedi.parser import ParserWithRecovery
from jedi.parser.tree import Module, search_ancestor, EndMarker, Flow
from jedi.parser.utils import parser_cache
from jedi import debug
from jedi.parser.tokenize import (generate_tokens, NEWLINE, TokenInfo,
ENDMARKER, INDENT, DEDENT)
class CachedFastParser(type):
    """Metaclass that answers `FastParser` calls from the parser cache."""

    def __call__(self, grammar, source, module_path=None):
        """
        Return a parser for ``source``.

        A fresh ``ParserWithRecovery`` is returned when nothing is cached for
        ``module_path`` or when ``settings.fast_parser`` is disabled.
        Otherwise the cached parser is brought up to date with a
        ``DiffParser`` and handed back.
        """
        cached_item = parser_cache.get(module_path, None)
        if cached_item is not None and settings.fast_parser:
            cached_parser = cached_item.parser
            differ = DiffParser(cached_parser)
            updated_module = differ.update(splitlines(source, keepends=True))
            # Both attributes have to point at the freshly built module.
            cached_parser.module = updated_module
            cached_parser._parsed = updated_module
            return cached_parser
        return ParserWithRecovery(grammar, source, module_path)
class FastParser(use_metaclass(CachedFastParser)):
    """
    Class whose instantiation is handled by the ``CachedFastParser``
    metaclass; the class body itself adds nothing.
    """
    pass
def _merge_used_names(base_dict, other_dict):
for key, names in other_dict.items():
base_dict.setdefault(key, []).extend(names)
def _get_last_line(node_or_leaf):
last_leaf = node_or_leaf.last_leaf()
if last_leaf.type == 'error_leaf':
typ = last_leaf.original_type
else:
typ = last_leaf.type
if typ == 'newline':
return last_leaf.start_pos[0]
else:
return last_leaf.end_pos[0]
def _flows_finished(grammar, stack):
"""
if, while, for and try might not be finished, because another part might
still be parsed.
"""
for dfa, newstate, (symbol_number, nodes) in stack:
if grammar.number2symbol[symbol_number] in ('if_stmt', 'while_stmt',
'for_stmt', 'try_stmt'):
return False
return True
def suite_or_file_input_is_valid(grammar, stack):
    """
    Tell whether the parser stack is in a state where parsing may stop.

    That is not the case while a flow statement is still open, or while the
    innermost suite on the stack only holds its newline.
    """
    if not _flows_finished(grammar, stack):
        return False

    innermost_suite_nodes = next(
        (
            nodes
            for _dfa, _newstate, (symbol_number, nodes) in reversed(stack)
            if grammar.number2symbol[symbol_number] == 'suite'
        ),
        None
    )
    if innermost_suite_nodes is None:
        # Not reaching a suite means that we're dealing with file_input
        # levels where there's no need for a valid statement in it. It can
        # also be empty.
        return True
    # If only newline is in the suite, the suite is not valid, yet.
    return len(innermost_suite_nodes) > 1
def _is_flow_node(node):
try:
value = node.children[0].value
except AttributeError:
return False
return value in ('if', 'for', 'while', 'try')
def _last_leaf_is_newline(last_leaf):
if last_leaf.prefix.endswith('\n'):
return True
if last_leaf.prefix:
return False
previous_leaf = last_leaf.get_previous_leaf()
return (previous_leaf.type == 'newline' or
previous_leaf.type == 'error_leaf' and
previous_leaf.original_type == 'newline')
def _update_positions(nodes, line_offset):
for node in nodes:
try:
children = node.children
except AttributeError:
# Is a leaf
node.start_pos = node.start_pos[0] + line_offset, node.start_pos[1]
else:
_update_positions(children, line_offset)
class DiffParser(object):
    """
    Builds a new module tree for changed source lines, starting from the
    tree of an already existing parser.
    """
    # Node type that marks the end marker leaf of a parsed part.
    endmarker_type = 'endmarker'

    def __init__(self, parser):
        """
        :param parser: The parser whose grammar and current root node are
            used as the basis for updates.
        """
        self._parser = parser
        self._grammar = self._parser._grammar
        self._old_module = parser.get_root_node()
def _reset(self):
    """Reset all per-update state and create the empty new module."""
    # Counters for copy and parse operations, and the copied line ranges.
    self._copy_count = 0
    self._parser_count = 0
    self._copied_ranges = []

    # Nothing of the new source has been processed yet.
    self._parsed_until_line = 0
    self._prefix = ''
    self._last_prefix = ''

    old_module = self._old_module
    self._old_children = old_module.children

    # The new module shares its children list with ``_new_children``.
    new_children = []
    new_module = Module(new_children)
    new_module.path = old_module.path
    new_module.used_names = {}
    self._new_children = new_children
    self._new_module = new_module
def update(self, lines_new):
    """
    Rebuild the module for ``lines_new`` and return the new module node.

    The algorithm works as follows:

    Equal:
        - Assure that the start is a newline, otherwise parse until we get
          one.
        - Copy from parsed_until_line + 1 to max(i2 + 1)
        - Make sure that the indentation is correct (e.g. add DEDENT)
        - Add old and change positions
    Insert:
        - Parse from parsed_until_line + 1 to min(j2 + 1), hopefully not
          much more.

    Always:
        - Set parsed_until_line

    :param lines_new: The new source as a list of lines; the last entry is
        compared against ``''`` to detect a missing trailing newline.
    :return: The new module node.
    """
    self._parser_lines_new = lines_new
    self._added_newline = False
    if lines_new[-1] != '':
        # The Python grammar needs a newline at the end of a file, but for
        # everything else we keep working with lines_new here.
        self._parser_lines_new = list(lines_new)
        self._parser_lines_new[-1] += '\n'
        self._added_newline = True

    self._reset()

    line_length = len(lines_new)
    lines_old = splitlines(self._parser.source, keepends=True)
    # Line based diff between the old source and the lines handed to the
    # parser.
    sm = difflib.SequenceMatcher(None, lines_old, self._parser_lines_new)
    debug.dbg('diff: line_lengths old: %s, new: %s' % (len(lines_old), line_length))

    # i1/i2 are indexes into the old lines, j1/j2 into the new lines.
    for operation, i1, i2, j1, j2 in sm.get_opcodes():
        debug.dbg('diff %s old[%s:%s] new[%s:%s]',
                  operation, i1 + 1, i2, j1 + 1, j2)

        if j2 == line_length + int(self._added_newline):
            # The empty part after the last newline is not relevant.
            j2 -= 1

        if operation == 'equal':
            # Unchanged lines: try to reuse the old nodes, shifted by the
            # difference between the new and the old line index.
            line_offset = j1 - i1
            self._copy_from_old_parser(line_offset, i2, j2)
        elif operation == 'replace':
            self._parse(until_line=j2)
        elif operation == 'insert':
            self._parse(until_line=j2)
        else:
            # Deleted old lines need no work in the new tree.
            assert operation == 'delete'

    # Cleanup (setting endmarker, used_names)
    self._cleanup()
    if self._added_newline:
        # The parser has to reference the new module before it removes the
        # newline that was appended above.
        self._parser.module = self._parser._parsed = self._new_module
        self._parser.remove_last_newline()
        self._parsed_until_line -= 1

    self._parser.source = ''.join(lines_new)
    self._old_module = self._new_module

    assert self._new_module.end_pos[0] == line_length
    return self._new_module
def _copy_from_old_parser(self, line_offset, until_line_old, until_line_new):
    """
    Reuse nodes of the old module for a range of unchanged lines.

    :param line_offset: Difference between the new and the old line number
        of the unchanged range; copied nodes are shifted by it.
    :param until_line_old: Last line of the range in the old source.
    :param until_line_new: Last line of the range in the new source.
    """
    while until_line_new > self._parsed_until_line:
        # Translate the current position back into old line numbers.
        parsed_until_line_old = self._parsed_until_line - line_offset
        line_stmt = self._get_old_line_stmt(parsed_until_line_old + 1)
        if line_stmt is None:
            # Parse 1 line at least. We don't need more, because we just
            # want to get into a state where the old parser has statements
            # again that can be copied (e.g. not lines within parentheses).
            self._parse(self._parsed_until_line + 1)
        else:
            p_children = line_stmt.parent.children
            index = p_children.index(line_stmt)
            # Match all the nodes that are in the wanted range.
            nodes = self._divide_nodes(p_children[index:], until_line_old)
            if nodes:
                self._copy_count += 1
                _update_positions(nodes, line_offset)
                self._insert_nodes(nodes)
                # Remember the copied lines; they are used later to take
                # over the used names of the old module.
                from_ = nodes[0].get_start_pos_of_prefix()[0]
                to = _get_last_line(nodes[-1])
                self._copied_ranges.append((from_, to))
                debug.dbg('diff actually copy %s to %s', from_, to)
            # We have copied as much as possible (but definitely not too
            # much). Therefore we just parse the rest.
            # We might not reach the end, because there's a statement
            # that is not finished.
            self._parse(until_line_new)
            break
def _get_old_line_stmt(self, old_line):
leaf = self._old_module.get_leaf_for_position((old_line, 0), include_prefixes=True)
if leaf.type == 'newline':
leaf = leaf.get_next_leaf()
if leaf.get_start_pos_of_prefix()[0] == old_line:
node = leaf
# TODO use leaf.get_definition one day when that one is working
# well.
while node.parent.type not in ('file_input', 'suite'):
node = node.parent
return node
# Must be on the same line. Otherwise we need to parse that bit.
return None
def _insert_nodes(self, nodes):
    """
    Insert ``nodes`` into the new module and update the parse position.

    A trailing end marker is not inserted; its prefix is kept in
    ``self._prefix`` and prepended to whatever gets inserted next.

    :param nodes: Non-empty list of nodes to append.
    :return: The new module if there is nothing to insert besides an end
        marker. Otherwise the parent the nodes were attached to, or its
        parent scope if that parent is a suite.
    """
    # Needs to be done before resetting the parsed
    before_node = self._get_before_insertion_node()

    last_leaf = nodes[-1].last_leaf()
    is_endmarker = last_leaf.type == self.endmarker_type
    self._last_prefix = ''
    if is_endmarker:
        self._parsed_until_line = last_leaf.start_pos[0]
        try:
            separation = last_leaf.prefix.rindex('\n')
        except ValueError:
            pass
        else:
            # Remove the whitespace part of the prefix after a newline.
            # That is not relevant if parentheses were opened. Always parse
            # until the end of a line.
            last_leaf.prefix, self._last_prefix = \
                last_leaf.prefix[:separation + 1], last_leaf.prefix[separation + 1:]
        if _last_leaf_is_newline(last_leaf):
            # The end marker sits at the start of a line, so that line itself
            # counts as not parsed yet.
            self._parsed_until_line -= 1
    else:
        if last_leaf.type == 'newline':
            # Newlines end on the next line, which means that they would cover
            # the next line. That line is not fully parsed at this point.
            self._parsed_until_line = last_leaf.start_pos[0]
        else:
            self._parsed_until_line = last_leaf.end_pos[0]
    debug.dbg('set parsed_until %s', self._parsed_until_line)

    # Hand the prefix that was kept from a previous end marker to the first
    # leaf of the new nodes.
    first_leaf = nodes[0].first_leaf()
    first_leaf.prefix = self._prefix + first_leaf.prefix
    self._prefix = ''
    if is_endmarker:
        self._prefix = last_leaf.prefix

        # The end marker itself never becomes part of the new tree here.
        nodes = nodes[:-1]
        if not nodes:
            return self._new_module

    # Now the preparations are done. We are inserting the nodes.
    if before_node is None:  # Everything is empty.
        self._new_children += nodes
        new_parent = self._new_module
    else:
        assert nodes[0].type != 'newline'
        line_indentation = nodes[0].start_pos[1]
        new_parent = before_node.parent
        # Walk outwards until a parent is found whose indentation is not
        # deeper than the one of the first new node.
        while True:
            p_children = new_parent.children
            if new_parent.type == 'suite':
                # A suite starts with NEWLINE, ...
                indentation = p_children[1].start_pos[1]
            else:
                indentation = p_children[0].start_pos[1]

            if line_indentation < indentation:  # Dedent
                # We might be at the most outer layer: modules. We
                # don't want to depend on the first statement
                # having the right indentation.
                if new_parent.parent is not None:
                    new_parent = search_ancestor(
                        new_parent,
                        ('suite', 'file_input')
                    )
                    continue

            p_children += nodes
            assert new_parent.type in ('suite', 'file_input')
            break

    # Reset the parents
    for node in nodes:
        node.parent = new_parent
    if new_parent.type == 'suite':
        return new_parent.get_parent_scope()

    return new_parent
def _get_before_insertion_node(self):
if not self._new_children:
return None
line = self._parsed_until_line + 1
node = self._new_module.last_leaf()
while True:
parent = node.parent
if parent.type in ('suite', 'file_input'):
assert node.end_pos[0] <= line
assert node.end_pos[1] == 0 or '\n' in self._prefix
return node
node = parent
def _divide_node(self, node, until_line):
if node.type not in ('classdef', 'funcdef'):
return None
suite = node.children[-1]
if suite.type != 'suite':
return None
nodes = self._divide_nodes(suite.children, until_line)
if len(nodes) < 2:
# A suite only with newline is not valid.
return None
new_node = copy.copy(node)
new_suite = copy.copy(suite)
# And now set the correct parents
for child in nodes:
child.parent = new_suite
new_suite.children = nodes
new_node.children = list(new_node.children)
new_node.children[-1] = new_suite
for child in new_node.children:
child.parent = new_node
return new_node
def _copy_divided_nodes(self, nodes):
    """
    Drop trailing parts of ``nodes`` that must not be copied: error leafs,
    error nodes and if/for/while/try statements at the very end.

    :param nodes: Non-empty list of candidate nodes.
    :return: ``nodes`` itself (its last entry may have been replaced by a
        trimmed copy) or a shortened new list.
    """
    parent = nodes[-1].last_leaf().get_parent_scope()
    if parent == nodes[0].get_parent_scope():
        check_nodes = nodes
    else:
        # The last leaf lives in another scope than the first node. Walk up
        # all ancestors; for every Flow on the way continue with the scope
        # around it.
        n = parent
        while n is not None:
            if isinstance(n, Flow):
                parent = n.get_parent_scope()
            n = n.parent
        check_nodes = parent.children

    last_node = check_nodes[-1]
    if last_node.type == 'suite':
        # Look at the statements inside of the suite instead.
        parent = last_node
        check_nodes = parent.children
        last_node = check_nodes[-1]

    drop_node_count = 0
    if last_node.type in ('error_leaf', 'error_node') or _is_flow_node(last_node):
        # Error leafs/nodes don't have a defined start/end. Error
        # nodes might not end with a newline (e.g. if there's an
        # open `(`). Therefore ignore all of them unless they are
        # succeeded with valid parser state.
        # If we copy flows at the end, they might be continued
        # after the copy limit (in the new parser).
        n = last_node
        # In this while loop we try to remove until we find a newline.
        while True:
            drop_node_count += 1
            try:
                n = check_nodes[-drop_node_count - 1]
            except IndexError:
                # Ran out of nodes to inspect.
                break
            if n.last_leaf().type == 'newline':
                break
    if drop_node_count:
        node = self._drop_last_node(nodes[-1], last_node, drop_node_count)
        if node is None:
            # Nothing of the last node can be kept.
            nodes = nodes[:-drop_node_count]
        else:
            nodes[-1] = node
    return nodes
def _drop_last_node(self, base_node, last_node_to_drop, drop_node_count):
if base_node == last_node_to_drop:
return None
last_node = base_node.children[-1]
child = self._drop_last_node(last_node, last_node_to_drop, drop_node_count)
base_node = copy.copy(base_node)
if child is None:
if base_node.type == 'suite' and len(base_node.children) <= 1 + drop_node_count:
return None
if base_node.type in ('classdef', 'funcdef'):
return None
base_node.children = base_node.children[:-drop_node_count]
else:
base_node.children = list(base_node.children)
base_node.children[-1] = child
child.parent = base_node
for c in base_node.children:
c.parent = base_node
return base_node
def _divide_nodes(self, nodes, until_line):
"""
Breaks up scopes and returns only the part until the given line.
Tries to get the parts it can safely get and ignores the rest.
"""
new_nodes = []
for i, child in enumerate(nodes):
# TODO this check might take a bit of time for large files. We
# might want to change this to do more intelligent guessing or
# binary search.
if _get_last_line(child) > until_line:
node = self._divide_node(child, until_line)
if node is not None:
new_nodes.append(node)
break
else:
new_nodes.append(child)
if new_nodes:
return self._copy_divided_nodes(new_nodes)
return new_nodes
def _parse(self, until_line):
"""
Parses at least until the given line, but might just parse more until a
valid state is reached.
"""
while until_line > self._parsed_until_line:
node = self._try_parse_part(until_line)
nodes = self._get_children_nodes(node)
self._insert_nodes(nodes)
_merge_used_names(
self._new_module.used_names,
node.used_names
)
def _get_children_nodes(self, node):
nodes = node.children
first_element = nodes[0]
# TODO this looks very strange...
if first_element.type == 'error_leaf' and \
first_element.original_type == 'indent':
assert False, str(nodes)
#assert nodes[-1].type == 'dedent'
## This means that the start and end leaf
#nodes = nodes[1:-1] + [nodes[-1]]
return nodes
def _try_parse_part(self, until_line):
    """
    Sets up a normal parser that uses a specialized tokenizer to only parse
    until a certain position (or a bit longer if the statement hasn't
    ended).

    Returns whatever the parser's ``parse`` call returns for that part.
    """
    self._parser_count += 1
    start_line = self._parsed_until_line
    # TODO speed up, shouldn't copy the whole list all the time.
    # memoryview?
    remaining_lines = self._parser_lines_new[start_line:]
    token_stream = self._diff_tokenize(
        remaining_lines,
        until_line,
        line_offset=start_line
    )
    # The token stream is a generator and looks at ``self._active_parser``
    # once it gets consumed, so the parser has to be stored before parsing
    # starts.
    part_parser = ParserWithRecovery(
        self._grammar,
        source='\n',
        start_parsing=False
    )
    self._active_parser = part_parser
    return part_parser.parse(tokenizer=token_stream)
def _cleanup(self):
    """Add used names and an end marker."""
    # Add the used names from the old parser to the new one. Only names on
    # lines that were copied over are taken.
    copied_lines = set()
    for first_line, last_line in self._copied_ranges:
        copied_lines.update(range(first_line, last_line + 1))

    target_names = self._new_module.used_names
    for key, old_names in self._old_module.used_names.items():
        surviving = [
            name for name in old_names
            if name.start_pos[0] in copied_lines
        ]
        if surviving:
            target_names.setdefault(key, []).extend(surviving)

    # Add an endmarker. It starts where the last leaf ends, moved on by the
    # pending prefix.
    try:
        end_pos = list(self._new_module.last_leaf().end_pos)
    except IndexError:
        end_pos = [1, 0]
    prefix_lines = splitlines(self._prefix)
    line_count = len(prefix_lines)
    assert line_count > 0
    if line_count == 1:
        end_pos[1] += len(prefix_lines[0])
    else:
        end_pos[0] += line_count - 1
        end_pos[1] = len(prefix_lines[-1])

    endmarker = EndMarker('', tuple(end_pos), self._prefix + self._last_prefix)
    endmarker.parent = self._new_module
    self._new_children.append(endmarker)
def _diff_tokenize(self, lines, until_line, line_offset=0):
    """
    Tokenize ``lines`` for a partial parse and stop as early as possible.

    The generator ends the token stream with an artificial ENDMARKER once
    the code dedents below the level the part started on, or after the
    first NEWLINE at or beyond ``until_line`` where the active parser is in
    a valid state.

    :param lines: The source lines to tokenize, starting at the first line
        that has not been parsed yet.
    :param until_line: Line (in numbers of the whole new source) that has to
        be reached at least.
    :param line_offset: Number added to the line of every token position.
    :return: A generator of ``TokenInfo`` tuples.
    """
    is_first_token = True
    omitted_first_indent = False
    indents = []
    line_iter = iter(lines)
    tokens = generate_tokens(lambda: next(line_iter, ''), use_exact_op_types=True)
    # The stack of the parser that consumes these tokens; it is inspected
    # to find out whether stopping is allowed.
    stack = self._active_parser.pgen_parser.stack
    for typ, string, start_pos, prefix in tokens:
        start_pos = start_pos[0] + line_offset, start_pos[1]
        if typ == INDENT:
            indents.append(start_pos[1])
            if is_first_token:
                omitted_first_indent = True
                # We want to get rid of indents that are only here because
                # we only parse part of the file. These indents would only
                # get parsed as error leafs, which doesn't make any sense.
                is_first_token = False
                continue
        is_first_token = False

        if typ == DEDENT:
            indents.pop()
            if omitted_first_indent and not indents:
                # We are done here, only thing that can come now is an
                # endmarker or another dedented code block.
                typ, string, start_pos, prefix = next(tokens)
                if '\n' in prefix:
                    # Keep the prefix only up to its last newline. What
                    # follows is the indentation of the next line, which
                    # does not belong to an end marker at column 0. ``\Z``
                    # is used instead of ``$`` so that a prefix ending in a
                    # newline is left alone.
                    prefix = re.sub(r'[^\n]+\Z', '', prefix)
                else:
                    prefix = ''
                # This position comes straight from the tokenizer, so the
                # offset still has to be added.
                yield TokenInfo(ENDMARKER, '', (start_pos[0] + line_offset, 0), prefix)
                break
        elif typ == NEWLINE and start_pos[0] >= until_line:
            yield TokenInfo(typ, string, start_pos, prefix)
            # Check if the parser is actually in a valid suite state.
            if suite_or_file_input_is_valid(self._grammar, stack):
                start_pos = start_pos[0] + 1, 0
                # Close all indents that were opened inside of this part.
                while len(indents) > int(omitted_first_indent):
                    indents.pop()
                    yield TokenInfo(DEDENT, '', start_pos, '')

                yield TokenInfo(ENDMARKER, '', start_pos, '')
                break
            else:
                # The newline was already yielded above.
                continue

        yield TokenInfo(typ, string, start_pos, prefix)