Some ideas for a diff parser.
@@ -107,13 +107,13 @@ class Parser(object):
         # For the fast parser.
         self.position_modifier = pt.PositionModifier()

+        self.source = source
         self._added_newline = False
         # The Python grammar needs a newline at the end of each statement.
         if not source.endswith('\n') and start_symbol == 'file_input':
             source += '\n'
             self._added_newline = True

-        self.source = source
         self._start_symbol = start_symbol
         self._grammar = grammar

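The hunk above moves `self.source = source` in front of the newline handling, so the stored source no longer contains the artificially appended newline. As an aside, a minimal, self-contained sketch of that normalization idea (hypothetical helper, not jedi's API):

def normalize_source(source):
    """Return (source, added_newline) with a guaranteed trailing newline.

    The grammar only accepts statements terminated by NEWLINE, so a missing
    final newline is added before parsing and can be stripped from the
    resulting tree again afterwards.
    """
    if not source.endswith('\n'):
        return source + '\n', True
    return source, False

src, added = normalize_source("x = 1")
assert src == "x = 1\n" and added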
@@ -129,15 +129,12 @@ class Parser(object):
             return self._parsed

         start_number = self._grammar.symbol2number[self._start_symbol]
-        pgen_parser = PgenParser(
+        self.pgen_parser = PgenParser(
             self._grammar, self.convert_node, self.convert_leaf,
             self.error_recovery, start_number
         )

-        try:
-            self._parsed = pgen_parser.parse(tokenizer)
-        finally:
-            self.stack = pgen_parser.stack
+        self._parsed = self.pgen_parser.parse(tokenizer)

         if self._start_symbol == 'file_input' != self._parsed.type:
             # If there's only one statement, we get back a non-module. That's
@@ -148,9 +145,15 @@ class Parser(object):

         if self._added_newline:
             self.remove_last_newline()
+        # The stack is empty now, we don't need it anymore.
+        del self.pgen_parser
+        return self._parsed

     def get_parsed_node(self):
-        # TODO rename to get_root_node
+        # TODO remove in favor of get_root_node
+        return self._parsed

+    def get_root_node(self):
         return self._parsed

     def error_recovery(self, grammar, stack, arcs, typ, value, start_pos, prefix,
@@ -5,617 +5,204 @@ finished (and still not working as I want), I won't document it any further.
 """
 import re
 from itertools import chain
+import difflib

 from jedi._compatibility import use_metaclass
 from jedi import settings
+from jedi.common import splitlines
 from jedi.parser import ParserWithRecovery
 from jedi.parser import tree
 from jedi.parser.utils import underscore_memoization, parser_cache
+from jedi.parser import tokenize
 from jedi import debug
-from jedi.parser.tokenize import (source_tokens, NEWLINE,
+from jedi.parser.tokenize import (generate_tokens, NEWLINE,
                                   ENDMARKER, INDENT, DEDENT)

-FLOWS = 'if', 'else', 'elif', 'while', 'with', 'try', 'except', 'finally', 'for'


-class FastModule(tree.Module):
-    type = 'file_input'
-
-    def __init__(self, module_path):
-        super(FastModule, self).__init__([])
-        self.modules = []
-        self.reset_caches()
-        self.names_dict = {}
-        self.path = module_path
-
-    def reset_caches(self):
-        self.modules = []
-        try:
-            del self._used_names # Remove the used names cache.
-        except AttributeError:
-            pass # It was never used.
-
-    @property
-    @underscore_memoization
-    def used_names(self):
-        return MergedNamesDict([m.used_names for m in self.modules])
-
-    @property
-    def global_names(self):
-        return [name for m in self.modules for name in m.global_names]
-
-    @property
-    def error_statements(self):
-        return [e for m in self.modules for e in m.error_statements]
-
-    def __repr__(self):
-        return "<fast.%s: %s@%s-%s>" % (type(self).__name__, self.name,
-                                        self.start_pos[0], self.end_pos[0])
-
-    # To avoid issues with with the `parser.ParserWithRecovery`, we need
-    # setters that do nothing, because if pickle comes along and sets those
-    # values.
-    @global_names.setter
-    def global_names(self, value):
-        pass
-
-    @error_statements.setter
-    def error_statements(self, value):
-        pass
-
-    @used_names.setter
-    def used_names(self, value):
-        pass
-
-
-class MergedNamesDict(object):
-    def __init__(self, dicts):
-        self.dicts = dicts
-
-    def __iter__(self):
-        return iter(set(key for dct in self.dicts for key in dct))
-
-    def __getitem__(self, value):
-        return list(chain.from_iterable(dct.get(value, []) for dct in self.dicts))
-
-    def items(self):
-        dct = {}
-        for d in self.dicts:
-            for key, values in d.items():
-                try:
-                    dct_values = dct[key]
-                    dct_values += values
-                except KeyError:
-                    dct[key] = list(values)
-        return dct.items()
-
-    def values(self):
-        lst = []
-        for dct in self.dicts:
-            lst += dct.values()
-        return lst
-
-
 class CachedFastParser(type):
     """ This is a metaclass for caching `FastParser`. """
     def __call__(self, grammar, source, module_path=None):
-        if not settings.fast_parser:
+        pi = parser_cache.get(module_path, None)
+        if pi is None or not settings.fast_parser:
             return ParserWithRecovery(grammar, source, module_path)

-        pi = parser_cache.get(module_path, None)
-        if pi is None or isinstance(pi.parser, ParserWithRecovery):
-            p = super(CachedFastParser, self).__call__(grammar, source, module_path)
-        else:
-            p = pi.parser # pi is a `cache.ParserCacheItem`
-            p.update(source)
-        return p
+        parser = pi.parser
+        d = DiffParser(parser)
+        d.update(splitlines(source, keepends=True))
+        return parser


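The rewritten `CachedFastParser.__call__` above now always reuses a cached parser through `DiffParser` when one exists and `settings.fast_parser` is enabled, and only falls back to a full `ParserWithRecovery` run otherwise. A rough, runnable sketch of that dispatch with stand-in classes (none of these names are jedi's real API):

from collections import namedtuple

CacheItem = namedtuple('CacheItem', 'parser')

class FullParser:                      # stand-in for ParserWithRecovery
    def __init__(self, source):
        self.source = source

class IncrementalUpdater:              # stand-in for DiffParser
    def __init__(self, parser):
        self._parser = parser
    def update(self, new_lines):
        self._parser.source = ''.join(new_lines)

def get_module(source, path, cache, fast=True):
    cached = cache.get(path)
    if cached is None or not fast:
        parser = FullParser(source)    # parse from scratch
        cache[path] = CacheItem(parser)
        return parser
    parser = cached.parser
    IncrementalUpdater(parser).update(source.splitlines(keepends=True))
    return parser

cache = {}
p1 = get_module("a = 1\n", "mod.py", cache)
p2 = get_module("a = 1\nb = 2\n", "mod.py", cache)
assert p1 is p2 and p2.source.endswith("b = 2\n")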
-class ParserNode(object):
-    def __init__(self, fast_module, parser, source):
-        self._fast_module = fast_module
-        self.parent = None
-        self._node_children = []
-
-        self.source = source
-        self.hash = hash(source)
-        self.parser = parser
-        if source:
-            self._end_pos = parser.module.end_pos
-        else:
-            self._end_pos = 1, 0
-
-        try:
-            # With fast_parser we have either 1 subscope or only statements.
-            self._content_scope = parser.module.subscopes[0]
-            # A parsed node's content will be in the first indent, because
-            # everything that's parsed is within this subscope.
-            self._is_class_or_def = True
-        except IndexError:
-            self._content_scope = parser.module
-            self._is_class_or_def = False
-        else:
-            self._rewrite_last_newline()
-
-        # We need to be able to reset the original children of a parser.
-        self._old_children = list(self._content_scope.children)
-
-    def is_root_node(self):
-        return self.parent is None
-
-    def _rewrite_last_newline(self):
-        """
-        The ENDMARKER can contain a newline in the prefix. However this prefix
-        really belongs to the function - respectively to the next function or
-        parser node. If we don't rewrite that newline, we end up with a newline
-        in the wrong position, i.d. at the end of the file instead of in the
-        middle.
-        """
-        c = self._content_scope.children
-        if tree.is_node(c[-1], 'suite'): # In a simple_stmt there's no DEDENT.
-            end_marker = self.parser.module.children[-1]
-            # Set the DEDENT prefix instead of the ENDMARKER.
-            c[-1].children[-1].prefix = end_marker.prefix
-            end_marker.prefix = ''
-
-    def __repr__(self):
-        module = self.parser.module
-        try:
-            return '<%s: %s-%s>' % (type(self).__name__, module.start_pos, module.end_pos)
-        except IndexError:
-            # There's no module yet.
-            return '<%s: empty>' % type(self).__name__
-
-    @property
-    def end_pos(self):
-        return self._end_pos[0] + self.parser.position_modifier.line, self._end_pos[1]
-
-    def reset_node(self):
-        """
-        Removes changes that were applied in this class.
-        """
-        self._node_children = []
-        scope = self._content_scope
-        scope.children = list(self._old_children)
-        try:
-            # This works if it's a MergedNamesDict.
-            # We are correcting it, because the MergedNamesDicts are artificial
-            # and can change after closing a node.
-            scope.names_dict = scope.names_dict.dicts[0]
-        except AttributeError:
-            pass
-
-    def close(self):
-        """
-        Closes the current parser node. This means that after this no further
-        nodes should be added anymore.
-        """
-        # We only need to replace the dict if multiple dictionaries are used:
-        if self._node_children:
-            dcts = [n.parser.module.names_dict for n in self._node_children]
-            # Need to insert the own node as well.
-            dcts.insert(0, self._content_scope.names_dict)
-            self._content_scope.names_dict = MergedNamesDict(dcts)
-            endmarker = self.parser.get_parsed_node().children[-1]
-            assert endmarker.type == 'endmarker'
-            last_parser = self._node_children[-1].parser
-            endmarker.start_pos = last_parser.get_parsed_node().end_pos
-
-    @property
-    def _indent(self):
-        if self.is_root_node():
-            return 0
-
-        return self.parser.module.children[0].start_pos[1]
-
-    def add_node(self, node, start_line, indent):
-        """
-        Adding a node means adding a node that was either just parsed or one
-        that can be reused.
-        """
-        # Content that is not a subscope can never be part of the current node,
-        # because it's basically a sister node, that sits next to it and not
-        # within it.
-        if (self._indent >= indent or not self._is_class_or_def) and \
-                not self.is_root_node():
-            self.close()
-            return self.parent.add_node(node, start_line, indent)
-
-        # Changing the line offsets is very important, because if they don't
-        # fit, all the start_pos values will be wrong.
-        m = node.parser.module
-        node.parser.position_modifier.line = start_line - 1
-        self._fast_module.modules.append(m)
-        node.parent = self
-
-        self._node_children.append(node)
-
-        # Insert parser objects into current structure. We only need to set the
-        # parents and children in a good way.
-        scope = self._content_scope
-        for child in m.children:
-            child.parent = scope
-            scope.children.append(child)
-
-        return node
-
-    def all_sub_nodes(self):
-        """
-        Returns all nodes including nested ones.
-        """
-        for n in self._node_children:
-            yield n
-            for y in n.all_sub_nodes():
-                yield y
-
-    @underscore_memoization # Should only happen once!
-    def remove_last_newline(self):
-        self.parser.remove_last_newline()
-
-
 class FastParser(use_metaclass(CachedFastParser)):
-    _FLOWS_NEED_SPACE = 'if', 'elif', 'while', 'with', 'except', 'for'
-    _FLOWS_NEED_COLON = 'else', 'try', 'except', 'finally'
-    _keyword_re = re.compile('^[ \t]*(def |class |@|(?:%s)|(?:%s)\s*:)'
-                             % ('|'.join(_FLOWS_NEED_SPACE),
-                                '|'.join(_FLOWS_NEED_COLON)))
+    pass

-    def __init__(self, grammar, source, module_path=None):
-        # set values like `tree.Module`.
-        self._grammar = grammar
-        self.module_path = module_path
-        self._reset_caches()
-        self.update(source)

-    def _reset_caches(self):
-        self.module = FastModule(self.module_path)
-        self.root_node = self.current_node = ParserNode(self.module, self, '')
+class DiffParser():
+    def __init__(self, parser):
+        self._parser = parser
+        self._module = parser.get_root_node()

-    def get_parsed_node(self):
-        return self.module
+    def _reset(self):
+        self._delete_count = 0
+        self._insert_count = 0

-    def update(self, source):
-        # Variables for testing purposes: It is important that the number of
-        # parsers used can be minimized. With these variables we can test
-        # against that.
-        self.number_parsers_used = 0
-        self.number_of_splits = 0
-        self.number_of_misses = 0
-        self.module.reset_caches()
-        self.source = source
-        try:
-            self._parse(source)
-        except:
-            # FastParser is cached, be careful with exceptions.
-            self._reset_caches()
-            raise
+        self._parsed_until_line = 0

-    def _split_parts(self, source):
-        """
-        Split the source code into different parts. This makes it possible to
-        parse each part seperately and therefore cache parts of the file and
-        not everything.
-        """
-        def gen_part():
-            text = ''.join(current_lines)
-            del current_lines[:]
-            self.number_of_splits += 1
-            return text
+    def update(self, lines_new):
+        '''
+        The algorithm works as follows:

-        def just_newlines(current_lines):
-            for line in current_lines:
-                line = line.lstrip('\t \n\r')
-                if line and line[0] != '#':
-                    return False
-            return True
+        Equal:
+            - Assure that the start is a newline, otherwise parse until we get
+              one.
+            - Copy from parsed_until_line + 1 to max(i2 + 1)
+            - Make sure that the indentation is correct (e.g. add DEDENT)
+            - Add old and change positions
+        Insert:
+            - Parse from parsed_until_line + 1 to min(j2 + 1), hopefully not
+              much more.
+        Always:
+            - Set parsed_until_line
+        '''
+        self._lines_new = lines_new
+        self._reset()

-        # Split only new lines. Distinction between \r\n is the tokenizer's
-        # job.
-        # It seems like there's no problem with form feed characters here,
-        # because we're not counting lines.
-        self._lines = source.splitlines(True)
-        current_lines = []
-        is_decorator = False
-        # Use -1, because that indent is always smaller than any other.
-        indent_list = [-1, 0]
-        new_indent = False
-        parentheses_level = 0
-        flow_indent = None
-        previous_line = None
-        # All things within flows are simply being ignored.
-        for i, l in enumerate(self._lines):
-            # Handle backslash newline escaping.
-            if l.endswith('\\\n') or l.endswith('\\\r\n'):
-                if previous_line is not None:
-                    previous_line += l
-                else:
-                    previous_line = l
-                continue
-            if previous_line is not None:
-                l = previous_line + l
-                previous_line = None
+        self._old_children = self._module.children
+        self._new_children = []
+        self._prefix = ''
+        lines_old = splitlines(self._parser.source, keepends=True)
+        sm = difflib.SequenceMatcher(None, lines_old, lines_new)
+        for operation, i1, i2, j1, j2 in sm.get_opcodes():
+            print(operation)
+            if operation == 'equal':
+                line_offset = j1 - i1
+                self._copy_from_old_parser(line_offset, i2 + 1, j2 + 1)
+            elif operation == 'replace':
+                self._delete_count += 1
+                self._insert(j2 + 1)
+            elif operation == 'insert':
+                self._insert(j2 + 1)
+            else:
+                assert operation == 'delete'
+                self._delete_count += 1 # For statistics

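The new `update()` drives everything from `difflib.SequenceMatcher` opcodes over lines split with `keepends=True`, exactly as its docstring describes. A small standard-library example of what those opcodes look like:

import difflib

old = "def f():\n    return 1\n\nx = f()\n".splitlines(keepends=True)
new = "def f():\n    return 2\n\nx = f()\ny = x\n".splitlines(keepends=True)

sm = difflib.SequenceMatcher(None, old, new)
for operation, i1, i2, j1, j2 in sm.get_opcodes():
    # 'equal' spans can be copied from the old tree (shifted by j1 - i1),
    # 'replace'/'insert' spans must be re-parsed, 'delete' spans are dropped.
    print(operation, old[i1:i2], new[j1:j2])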
-            # check for dedents
-            s = l.lstrip('\t \n\r')
-            indent = len(l) - len(s)
-            if not s or s[0] == '#':
-                current_lines.append(l) # Just ignore comments and blank lines
-                continue
-            if new_indent and not parentheses_level:
-                if indent > indent_list[-2]:
-                    # Set the actual indent, not just the random old indent + 1.
-                    indent_list[-1] = indent
-                new_indent = False
-
-            while indent < indent_list[-1]: # -> dedent
-                indent_list.pop()
-                # This automatically resets the flow_indent if there was a
-                # dedent or a flow just on one line (with one simple_stmt).
-                new_indent = False
-                if flow_indent is None and current_lines and not parentheses_level:
-                    yield gen_part()
-                flow_indent = None
-
-            # Check lines for functions/classes and split the code there.
-            if flow_indent is None:
-                m = self._keyword_re.match(l)
-                if m:
-                    # Strip whitespace and colon from flows as a check.
-                    if m.group(1).strip(' \t\r\n:') in FLOWS:
-                        if not parentheses_level:
-                            flow_indent = indent
-                    else:
-                        if not is_decorator and not just_newlines(current_lines):
-                            yield gen_part()
-                        is_decorator = '@' == m.group(1)
-                        if not is_decorator:
-                            parentheses_level = 0
-                            # The new indent needs to be higher
-                            indent_list.append(indent + 1)
-                            new_indent = True
-                elif is_decorator:
-                    is_decorator = False
-
-            parentheses_level = \
-                max(0, (l.count('(') + l.count('[') + l.count('{') -
-                        l.count(')') - l.count(']') - l.count('}')))
-
-            current_lines.append(l)
-
-        if previous_line is not None:
-            current_lines.append(previous_line)
-        if current_lines:
-            yield gen_part()
-
-    def _parse(self, source):
-        """ :type source: str """
-        added_newline = False
-        if not source or source[-1] != '\n':
-            # To be compatible with Pythons grammar, we need a newline at the
-            # end. The parser would handle it, but since the fast parser abuses
-            # the normal parser in various ways, we need to care for this
-            # ourselves.
-            source += '\n'
-            added_newline = True
-
-        next_code_part_end_line = code_part_end_line = 1
-        start = 0
-        nodes = list(self.root_node.all_sub_nodes())
-        # Now we can reset the node, because we have all the old nodes.
-        self.root_node.reset_node()
-        self.current_node = self.root_node
-        last_end_line = 1
-
-        for code_part in self._split_parts(source):
-            next_code_part_end_line += code_part.count('\n')
-            # If the last code part parsed isn't equal to the current end_pos,
-            # we know that the parser went further (`def` start in a
-            # docstring). So just parse the next part.
-            if code_part_end_line == last_end_line:
-                self._parse_part(code_part, source[start:], code_part_end_line, nodes)
-            else:
-                self.number_of_misses += 1
-                # Means that some lines where not fully parsed. Parse it now.
-                # This is a very rare case. Should only happens with very
-                # strange code bits.
-                while last_end_line < next_code_part_end_line:
-                    code_part_end_line = last_end_line
-                    # We could calculate the src in a more complicated way to
-                    # make caching here possible as well. However, this is
-                    # complicated and error-prone. Since this is not very often
-                    # called - just ignore it.
-                    src = ''.join(self._lines[code_part_end_line - 1:])
-                    self._parse_part(code_part, src, code_part_end_line, nodes)
-                    last_end_line = self.current_node.end_pos[0]
-                debug.dbg("While parsing %s, starting with line %s wasn't included in split.",
-                          self.module_path, code_part_end_line)
-                #assert code_part_end_line > last_end_line
-                # This means that the parser parsed faster than the last given
-                # `code_part`.
-                debug.dbg('While parsing %s, line %s slowed down the fast parser.',
-                          self.module_path, code_part_end_line)
-
-            code_part_end_line = next_code_part_end_line
-            start += len(code_part)
-
-            last_end_line = self.current_node.end_pos[0]
-
-        if added_newline:
-            self.current_node.remove_last_newline()
-
-        # Now that the for loop is finished, we still want to close all nodes.
-        node = self.current_node
-        while node is not None:
-            node.close()
-            node = node.parent
-
-        debug.dbg('Parsed %s, with %s parsers in %s splits.'
-                  % (self.module_path, self.number_parsers_used,
-                     self.number_of_splits))
-
-    def _parse_part(self, source, parser_code, code_part_end_line, nodes):
-        """
-        Side effect: Alters the list of nodes.
-        """
-        h = hash(source)
-        for index, node in enumerate(nodes):
-            if node.hash == h and node.source == source:
-                node.reset_node()
-                nodes.remove(node)
-                parser_code = source
+    def _copy_from_old_parser(self, line_offset, until_line_old, until_line_new):
+        while until_line_new < self._parsed_until_line:
+            parsed_until_line_old = self._parsed_until_line + line_offset
+            if matches:
+                # TODO check missing indent/dedent
+                _copy_p()
+                self._update_positions(line_offset)
+                # We have copied as much as possible (but definitely not too
+                # much). Therefore we escape, even if we're not at the end. The
+                # rest will be parsed.
+                # Might not reach until the end, because there's a statement
+                # that is not finished.
                 break
-        else:
-            tokenizer = FastTokenizer(parser_code)
-            self.number_parsers_used += 1
-            p = ParserWithRecovery(self._grammar, parser_code, self.module_path, tokenizer=tokenizer)
+            else:
+                # Parse 1 line at least. We don't need more, because we just
+                # want to get into a state where the old parser has starting
+                # statements again (not e.g. lines within parentheses).
+                self._parse(self._parsed_until_line + 1)

-        end = code_part_end_line - 1 + p.module.end_pos[0]
-        used_lines = self._lines[code_part_end_line - 1:end - 1]
-        code_part_actually_used = ''.join(used_lines)
+    def _update_positions(self, line_offset, line_start, line_end):
+        if line_offset == 0:
+            return

-        node = ParserNode(self.module, p, code_part_actually_used)
+        # Find start node:
+        node = self._parser.get_pared_node()
+        while True:
+            return node

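`_update_positions` is only stubbed out here; the underlying idea is that nodes reused from an 'equal' span keep their structure and only get their line numbers shifted by the offset between old and new files (`j1 - i1`). A trivial illustration of that shift with a hypothetical helper:

def shift_position(pos, line_offset):
    """Shift a (line, column) position by a number of lines."""
    line, column = pos
    return line + line_offset, column

assert shift_position((10, 4), 3) == (13, 4)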
-        indent = len(parser_code) - len(parser_code.lstrip('\t '))
+    def _insert(self, until_line_new):
+        self._insert_count += 1
+        self._parse(until_line_new)

-        self.current_node.add_node(node, code_part_end_line, indent)
-        self.current_node = node
+    def _get_before_insertion_node(self):
+        if not self._new_children:
+            return None

+        leaf = self._module.get_leaf_for_position((line, 0), include_prefixes=False)
+        while leaf.type != 'newline':
+            try:
+                leaf = leaf.get_previous_leaf()
+            except IndexError:
+                # TODO
+                raise NotImplementedError

-class FastTokenizer(object):
-    """
-    Breaks when certain conditions are met, i.e. a new function or class opens.
-    """
-    def __init__(self, source):
-        self.source = source
-        self._gen = source_tokens(source, use_exact_op_types=True)
-        self._closed = False
+        node = leaf
+        while True:
+            parent = node.parent
+            print(parent)
+            if parent.type in ('suite', 'file_input'):
+                print(node)
+                print(i, line, node.end_pos)
+                assert node.end_pos[0] <= line
+                assert node.end_pos[1] == 0
+                return node
+            node = parent
+
+    def _parse(self, until_line):
+        """
+        Parses at least until the given line, but might just parse more until a
+        valid state is reached.
+        """
+        while until_line > self._parsed_until_line:
+            node = self._parse_scope_part(before_node, until_line)
+            first_leaf = node.first_leaf()

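`_get_before_insertion_node` above walks from a leaf up to the child of the enclosing `suite`/`file_input`, i.e. the node after which freshly parsed children can be appended. A self-contained toy version of that walk (simplified `Node` class, not jedi's tree API):

class Node:
    def __init__(self, type_, parent=None):
        self.type = type_
        self.parent = parent

def enclosing_block_child(leaf):
    """Climb up until the parent is a block ('suite' or 'file_input')."""
    node = leaf
    while node.parent is not None:
        if node.parent.type in ('suite', 'file_input'):
            return node
        node = node.parent
    return node

module = Node('file_input')
funcdef = Node('funcdef', module)
suite = Node('suite', funcdef)
stmt = Node('simple_stmt', suite)
name = Node('name', stmt)
assert enclosing_block_child(name) is stmt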
-        # fast parser options
-        self.current = self.previous = NEWLINE, '', (0, 0)
-        self._in_flow = False
-        self._is_decorator = False
-        self._first_stmt = True
-        self._parentheses_level = 0
-        self._indent_counter = 0
-        self._flow_indent_counter = 0
-        self._returned_endmarker = False
-        self._expect_indent = False
-
-    def __iter__(self):
-        return self
-
-    def next(self):
-        """ Python 2 Compatibility """
-        return self.__next__()
-
-    def __next__(self):
-        if self._closed:
-            return self._finish_dedents()
-
-        typ, value, start_pos, prefix = current = next(self._gen)
-        if typ == ENDMARKER:
-            self._closed = True
-            self._returned_endmarker = True
-            return current
-
-        self.previous = self.current
-        self.current = current
-
-        if typ == INDENT:
-            self._indent_counter += 1
-            if not self._expect_indent and not self._first_stmt and not self._in_flow:
-                # This does not mean that there is an actual flow, it means
-                # that the INDENT is syntactically wrong.
-                self._flow_indent_counter = self._indent_counter - 1
-                self._in_flow = True
-            self._expect_indent = False
-        elif typ == DEDENT:
-            self._indent_counter -= 1
-            if self._in_flow:
-                if self._indent_counter == self._flow_indent_counter:
-                    self._in_flow = False
+            before_node = self._get_before_insertion_node()
+            if before_node is None:
+                # The start of the file.
+                self.new_children += node.children
             else:
-                self._closed = True
-                return current
+                before_node.parent.children += node.children

-        previous_type = self.previous[0]
-        if value in ('def', 'class') and self._parentheses_level:
-            # Account for the fact that an open parentheses before a function
-            # will reset the parentheses counter, but new lines before will
-            # still be ignored. So check the prefix.
+    def _parse_scope_node(self, before_node, until_line, line_offset=0):
+        # TODO speed up, shouldn't copy the whole thing all the time.
+        # memoryview?
+        lines_after = self._lines_new[self._parsed_until_line + 1:]
+        tokenizer = self._diff_tokenize(lines_after, until_line, line_offset)
+        self._parser = ParserWithRecovery(
+            self._parser._grammar,
+            source=None,
+            tokenizer=tokenizer,
+            start_parsing=False
+        )
+        return self._parser.parse()

-            # TODO what about flow parentheses counter resets in the tokenizer?
-            self._parentheses_level = 0
-            # We need to simulate a newline before the indent, because the
-            # open parentheses ignored them.
-            if re.search('\n\s*', prefix):
-                previous_type = NEWLINE
+    def _diff_tokenize(lines, until_line, line_offset=0):
+        is_first_token = True
+        omited_first_indent = False
+        indent_count = 0
+        tokens = generate_tokens(lambda: next(l, ''))
+        for token_info in tokens:
+            typ = token_info.type
+            if typ == 'indent':
+                indent_count += 1
+                if is_first_token:
+                    omited_first_indent = True
+                    # We want to get rid of indents that are only here because
+                    # we only parse part of the file. These indents would only
+                    # get parsed as error leafs, which doesn't make any sense.
+                    continue
+            elif typ == 'dedent':
+                indent_count -= 1
+                if omited_first_indent and indent_count == 0:
+                    # We are done here, only thing that can come now is an
+                    # endmarker or another dedented code block.
+                    break
+            elif typ == 'newline' and token_info.start_pos[0] >= until_line:
+                yield token_info
+                x = self.
+                import pdb; pdb.set_trace()
+                break

-        # Parentheses ignore the indentation rules. The other three stand for
-        # new lines.
-        if previous_type in (NEWLINE, INDENT, DEDENT) \
-                and not self._parentheses_level and typ not in (INDENT, DEDENT):
-            if not self._in_flow:
-                if value in FLOWS:
-                    self._flow_indent_counter = self._indent_counter
-                    self._first_stmt = False
-                elif value in ('def', 'class', '@'):
-                    # The values here are exactly the same check as in
-                    # _split_parts, but this time with tokenize and therefore
-                    # precise.
-                    if not self._first_stmt and not self._is_decorator:
-                        return self._close()
-
-                    self._is_decorator = '@' == value
-                    if not self._is_decorator:
-                        self._first_stmt = False
-                        self._expect_indent = True
-                elif self._expect_indent:
-                    return self._close()
+            is_first_token = False
+            if line_offset != 0:
+                raise NotImplementedError
+                yield tokenize.TokenInfo(*token_info.string[1:])
             else:
-                self._first_stmt = False
+                yield token_info

-        if value in '([{' and value:
-            self._parentheses_level += 1
-        elif value in ')]}' and value:
-            # Ignore closing parentheses, because they are all
-            # irrelevant for the indentation.
-            self._parentheses_level = max(self._parentheses_level - 1, 0)
-        return current
-
-    def _close(self):
-        if self._first_stmt:
-            # Continue like nothing has happened, because we want to enter
-            # the first class/function.
-            if self.current[1] != '@':
-                self._first_stmt = False
-            return self.current
-        else:
-            self._closed = True
-            return self._finish_dedents()
-
-    def _finish_dedents(self):
-        if self._indent_counter:
-            self._indent_counter -= 1
-            return DEDENT, '', self.current[2], ''
-        elif not self._returned_endmarker:
-            self._returned_endmarker = True
-            return ENDMARKER, '', self.current[2], self._get_prefix()
-        else:
-            raise StopIteration
-
-    def _get_prefix(self):
-        """
-        We're using the current prefix for the endmarker to not loose any
-        information. However we care about "lost" lines. The prefix of the
-        current line (indent) will always be included in the current line.
-        """
-        cur = self.current
-        while cur[0] == DEDENT:
-            cur = next(self._gen)
-        prefix = cur[3]
-
-        # \Z for the end of the string. $ is bugged, because it has the
-        # same behavior with or without re.MULTILINE.
-        return re.sub(r'[^\n]+\Z', '', prefix)
+        yield tokenize.TokenInfo(tokenize.ENDMARKER, *token_info.string[1:])
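`_diff_tokenize` in the new code feeds `generate_tokens` a readline built from a list of lines and drops a leading INDENT so that a partial file parses cleanly. Both tricks can be demonstrated with the standard-library tokenizer (this uses Python's `tokenize` module as a stand-in, not jedi's tokenizer):

import tokenize

lines = iter(["    a = 1\n", "    b = 2\n"])
skipped_first_indent = False

# readline-from-lines: returning '' signals end of input to the tokenizer.
for tok in tokenize.generate_tokens(lambda: next(lines, '')):
    if tok.type == tokenize.INDENT and not skipped_first_indent:
        skipped_first_indent = True   # drop the artificial leading indent
        continue
    if tok.type == tokenize.ENDMARKER:
        break
    print(tokenize.tok_name[tok.type], repr(tok.string))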
@@ -86,7 +86,6 @@ def save_parser(path, parser, pickling=True):


 class ParserPickling(object):
-
     version = 26
     """
     Version number (integer) for file system cache.
@@ -24,6 +24,7 @@ def test_add_to_end():
     class Two(Abc):
         def h(self):
             self
+
     """) # ^ here is the first completion

     b = " def g(self):\n" \
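The test above appends code at the end of an existing module, which is the cheapest case for a diff parser: everything already parsed can be kept and only the tail needs re-parsing. A quick check of that with `difflib` (illustrative only, independent of jedi's test suite):

import difflib

old = ["class Two:\n", "    def h(self):\n", "        self\n"]
new = old + ["\n", "    def g(self):\n", "        self.\n"]

opcodes = difflib.SequenceMatcher(None, old, new).get_opcodes()
# Appending at the end shows up as one 'equal' block followed by one
# 'insert' block.
assert [op for op, *_ in opcodes] == ['equal', 'insert']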