
Trying to restructure the fast parser.

Dave Halter
2015-01-19 14:49:44 +01:00
parent d6b3b76d26
commit ce793b1066
2 changed files with 92 additions and 83 deletions


@@ -20,9 +20,9 @@ from jedi.parser.tokenize import (source_tokens, FLOWS, NEWLINE, COMMENT,
 class FastModule(pr.Module, pr.Simple):
     type = 'file_input'
 
-    def __init__(self, parsers):
+    def __init__(self):
         super(FastModule, self).__init__([])
-        self.parsers = parsers
+        self.modules = []
         self.reset_caches()
 
     def reset_caches(self):
@@ -35,21 +35,21 @@ class FastModule(pr.Module, pr.Simple):
         if name.startswith('__'):
             raise AttributeError('Not available!')
         else:
-            return getattr(self.parsers[0].module, name)
+            return getattr(self.modules[0], name)
 
     @property
     @cache.underscore_memoization
     def used_names(self):
         """
         used_names = {}
-        for p in self.parsers:
-            for k, statement_set in p.module.used_names.items():
+        for m in self.modules:
+            for k, statement_set in m.used_names.items():
                 if k in used_names:
                     used_names[k] |= statement_set
                 else:
                     used_names[k] = set(statement_set)
         """
-        return MergedNamesDict([p.module.used_names for p in self.parsers])
+        return MergedNamesDict([m.used_names for m in self.modules])
 
     def __repr__(self):
         return "<fast.%s: %s@%s-%s>" % (type(self).__name__, self.name,
@@ -87,16 +87,23 @@ class CachedFastParser(type):
 
 class ParserNode(object):
-    def __init__(self, fast_module, parser, code, parent=None):
+    def __init__(self, fast_module, parent=None):
         self._fast_module = fast_module
         self.parent = parent
 
-        self.parser_children = []
-        # must be created before new things are added to it.
-        self.save_contents(parser, code)
+        self.node_children = []
+        self.code = None
+        self.hash = None
+        self.parser = None
 
-    def save_contents(self, parser, code):
-        print('SAVE')
+    def __repr__(self):
+        if self.parser is None:
+            return '<%s: empty>' % type(self).__name__
+        module = self.parser.module
+        return '<%s: %s-%s>' % (type(self).__name__, module.start_pos, module.end_pos)
+
+    def set_parser(self, parser, code):
         self.code = code
         self.hash = hash(code)
         self.parser = parser
@@ -116,8 +123,7 @@ class ParserNode(object):
             self._is_generator = scope.is_generator
         """
 
-        self.old_children = self.parser_children
-        self.parser_children = []
+        self.node_children = []
 
     def reset_contents(self):
         """
@@ -133,34 +139,31 @@ class ParserNode(object):
         # they make no sense.
         self.parser.module.global_vars = []
         """
-        for c in self.parser_children:
-            c.reset_contents()
+        # TODO REMOVE
 
     def close(self):
         """
         Closes the current parser node. This means that after this no further
         nodes should be added anymore.
         """
-        print('CLOSE NODE', self.parent, self.parser_children)
+        print('CLOSE NODE', self.parent, self.node_children)
         print(self.parser.module.names_dict, [p.parser.module.names_dict for p in
-                                              self.parser_children])
+                                              self.node_children])
         # We only need to replace the dict if multiple dictionaries are used:
-        if self.parser_children:
-            dcts = [n.parser.module.names_dict for n in self.parser_children]
+        if self.node_children:
+            dcts = [n.parser.module.names_dict for n in self.node_children]
             dct = MergedNamesDict([self._names_dict_scope.names_dict] + dcts)
             self._content_scope.names_dict = dct
 
     def parent_until_indent(self, indent=None):
-        if indent is None or self.indent >= indent and self.parent:
-            self.old_children = []
+        if indent is None or self._indent >= indent and self.parent:
             if self.parent is not None:
                 self.close()
                 return self.parent.parent_until_indent(indent)
         return self
 
     @property
-    def indent(self):
+    def _indent(self):
         if not self.parent:
             return 0
         module = self.parser.module
@@ -202,13 +205,17 @@ class ParserNode(object):
         scope.is_generator |= parser.module.is_generator
         """
 
-    def add_node(self, node, set_parent=False):
+    def add_node(self, node, line_offset):
         """Adding a node means adding a node that was already added earlier"""
         print('ADD')
-        self.parser_children.append(node)
-        self._set_items(node.parser, set_parent=set_parent)
-        node.old_children = node.parser_children  # TODO potential memory leak?
-        node.parser_children = []
+        # Changing the line offsets is very important, because if they don't
+        # fit, all the start_pos values will be wrong.
+        m = node.parser.module
+        m.line_offset += line_offset + 1 - m.start_pos[0]
+        self.node_children.append(node)
+        self._set_items(node.parser, set_parent=node.parent == self)
+        node.node_children = []
 
         """
         scope = self.content_scope
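The new add_node recenters a reused module with m.line_offset += line_offset + 1 - m.start_pos[0]. A worked example of that arithmetic (numbers invented for illustration):

    # A cached module was first parsed starting at line 12 (start_pos[0] == 12)
    # and is re-attached after 40 preceding lines (line_offset == 40), so it
    # must now start at line 41; its offset therefore grows by 41 - 12 = 29.
    start_line = 12
    line_offset = 40
    delta = line_offset + 1 - start_line
    assert delta == 29
    assert start_line + delta == 41  # all start_pos values shift along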
@@ -222,9 +229,20 @@ class ParserNode(object):
         return node
 
     def add_parser(self, parser, code):
+        # TODO REMOVE
+        raise NotImplementedError
         print('add parser')
         return self.add_node(ParserNode(self._fast_module, parser, code, self), True)
 
+    def all_nodes(self):
+        """
+        Returns all nodes including nested ones.
+        """
+        yield self
+        for n in self.node_children:
+            for y in n.all_nodes():
+                yield y
+
 
 class FastParser(use_metaclass(CachedFastParser)):
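The new all_nodes generator yields a node and then recurses into node_children: a depth-first, pre-order walk, written with nested yields so it also runs on the Python 2 versions jedi still supported. With a hypothetical stand-in node class, the traversal order looks like this:

    class Node(object):
        def __init__(self, name, children=()):
            self.name = name
            self.node_children = list(children)

        def all_nodes(self):
            yield self
            for n in self.node_children:
                for y in n.all_nodes():
                    yield y

    root = Node('root', [Node('a', [Node('a1')]), Node('b')])
    assert [n.name for n in root.all_nodes()] == ['root', 'a', 'a1', 'b']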
@@ -234,20 +252,20 @@ class FastParser(use_metaclass(CachedFastParser)):
         # set values like `pr.Module`.
         self._grammar = grammar
         self.module_path = module_path
-        self.current_node = None
-        self.parsers = []
-        self.module = FastModule(self.parsers)
-        self.reset_caches()
+        print(module_path)
+        self._reset_caches()
 
         try:
             self._parse(code)
         except:
             # FastParser is cached, be careful with exceptions
-            del self.parsers[:]
+            self._reset_caches()
             raise
 
+    def _reset_caches(self):
+        self.module = FastModule()
+        self.current_node = ParserNode(self.module)
+
     def update(self, code):
         self.reset_caches()
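Both __init__ and update now funnel failures through _reset_caches before re-raising: a FastParser instance lives in a module-level cache, so a failed parse must not leave half-built state behind. The shape of that pattern in isolation (a sketch; the class and method names are stand-ins):

    class CachingParserSketch(object):
        def __init__(self):
            self._reset_caches()

        def _reset_caches(self):
            self.module = None
            self.current_node = None

        def update(self, code):
            try:
                self._parse(code)
            except Exception:
                # The instance stays cached, so never leave half-built
                # state behind before re-raising.
                self._reset_caches()
                raise

        def _parse(self, code):
            raise NotImplementedError  # real parsing omitted in this sketch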
@@ -255,7 +273,7 @@ class FastParser(use_metaclass(CachedFastParser)):
             self._parse(code)
         except:
             # FastParser is cached, be careful with exceptions
-            del self.parsers[:]
+            self._reset_caches()
             raise
 
     def _split_parts(self, code):
@@ -320,57 +338,45 @@ class FastParser(use_metaclass(CachedFastParser)):
     def _parse(self, code):
         """ :type code: str """
-        def empty_parser():
-            new, temp = self._get_parser(unicode(''), unicode(''), 0, [], False)
-            return new
-
-        del self.parsers[:]
+        def empty_parser_node():
+            return self._get_node(unicode(''), unicode(''), 0, [], False)
 
         line_offset = 0
         start = 0
         p = None
         is_first = True
+        nodes = self.current_node.all_nodes()
 
         for code_part in self._split_parts(code):
             if is_first or line_offset + 1 == p.module.end_pos[0]:
                 print(repr(code_part))
                 indent = len(code_part) - len(code_part.lstrip('\t '))
-                if is_first and self.current_node is not None:
-                    nodes = [self.current_node]
-                else:
-                    nodes = []
-                if self.current_node is not None:
-                    self.current_node = self.current_node.parent_until_indent(indent)
-                    nodes += self.current_node.old_children
+                self.current_node = self.current_node.parent_until_indent(indent)
 
                 # check if code_part has already been parsed
                 # print '#'*45,line_offset, p and p.module.end_pos, '\n', code_part
-                p, node = self._get_parser(code_part, code[start:],
-                                           line_offset, nodes, not is_first)
-                print('HmmmmA', p.module.names_dict)
-
-                # The actual used code_part is different from the given code
-                # part, because of docstrings for example there's a chance that
-                # splits are wrong.
-                used_lines = self._lines[line_offset:p.module.end_pos[0]]
-                code_part_actually_used = '\n'.join(used_lines)
-
-                if is_first and p.module.subscopes:
+                self.current_node = self._get_node(code_part, code[start:],
+                                                   line_offset, nodes, not is_first)
+                print('HmmmmA', self.current_node.parser.module.names_dict)
+
+                if is_first and self.current_node.parser.module.subscopes:
                     print('NOXXXX')
-                    # special case, we cannot use a function subscope as a
+                    raise NotImplementedError
+                    # Special case, we cannot use a function subscope as a
                     # base scope, subscopes would save all the other contents
-                    new = empty_parser()
-                    if self.current_node is None:
-                        self.current_node = ParserNode(self.module, new, '')
-                    else:
-                        self.current_node.save_contents(new, '')
+                    new = empty_parser_node()  # TODO should be node =
+                    self.current_node.set_parser(new, '')
                     self.parsers.append(new)
                     is_first = False
 
+                """
                 if is_first:
                     if self.current_node is None:
                         self.current_node = ParserNode(self.module, p, code_part_actually_used)
                     else:
-                        self.current_node.save_contents(p, code_part_actually_used)
+                        pass
                 else:
                     if node is None:
                         self.current_node = \
@@ -379,6 +385,7 @@ class FastParser(use_metaclass(CachedFastParser)):
                         self.current_node = self.current_node.add_node(node)
                     self.parsers.append(p)
+                """
 
                 is_first = False
             #else:
@@ -387,11 +394,13 @@ class FastParser(use_metaclass(CachedFastParser)):
             line_offset += code_part.count('\n') + 1
             start += len(code_part) + 1  # +1 for newline
 
+        # Now that the for loop is finished, we still want to close all nodes.
         if self.parsers:
             self.current_node = self.current_node.parent_until_indent()
             self.current_node.close()
         else:
-            self.parsers.append(empty_parser())
+            raise NotImplementedError
+            self.parsers.append(empty_parser_node())
 
         """ TODO used?
         self.module.end_pos = self.parsers[-1].module.end_pos
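parent_until_indent (see the ParserNode hunk above) climbs the node tree, closing nodes, until it reaches a node less indented than the new code part; that survivor is where the next node attaches. A simplified, iterative sketch of the climbing rule (close() and the diff's exact operator precedence omitted):

    class N(object):
        def __init__(self, indent, parent=None):
            self.indent = indent
            self.parent = parent

    def parent_until_indent(node, indent=None):
        # Climb while the current node is at least as indented as the new
        # code part (or unconditionally when indent is None).
        while node.parent is not None and (indent is None or node.indent >= indent):
            node = node.parent
        return node

    root = N(0)
    func = N(4, root)
    inner = N(8, func)
    assert parent_until_indent(inner, 4) is root  # dedent to top level
    assert parent_until_indent(inner, 8) is func  # stays inside the function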
@@ -399,30 +408,33 @@ class FastParser(use_metaclass(CachedFastParser)):
         # print(self.parsers[0].module.get_code())
 
-    def _get_parser(self, code, parser_code, line_offset, nodes, no_docstr):
+    def _get_node(self, code, parser_code, line_offset, nodes, no_docstr):
+        """
+        Side effect: Alters the list of nodes.
+        """
         h = hash(code)
-        for index, node in enumerate(nodes):
+        for index, node in enumerate(list(nodes)):
+            print('EQ', node, repr(node.code), repr(code))
             if node.hash == h and node.code == code:
-                if node != self.current_node:
-                    offset = int(nodes[0] == self.current_node)
-                    self.current_node.old_children.pop(index - offset)
-                p = node.parser
-                m = p.module
-                m.line_offset += line_offset + 1 - m.start_pos[0]
+                nodes.remove(node)
                 break
         else:
+            print('ACTUALLY PARSING')
             tokenizer = FastTokenizer(parser_code, line_offset)
             p = Parser(self._grammar, parser_code, self.module_path, tokenizer=tokenizer)
             #p.module.parent = self.module  # With the new parser this is not
             # necessary anymore?
-            node = None
-        return p, node
-
-    def reset_caches(self):
-        self.module.reset_caches()
-        if self.current_node is not None:
-            self.current_node.reset_contents()
+            node = ParserNode(self.module, self.current_node)
+
+            # The actual used code_part is different from the given code
+            # part, because of docstrings for example there's a chance that
+            # splits are wrong.
+            used_lines = self._lines[line_offset:p.module.end_pos[0] - 1]
+            code_part_actually_used = '\n'.join(used_lines)
+            node.set_parser(p, code_part_actually_used)
+
+        self.current_node.add_node(node, line_offset)
+        return node
 
 
 class FastTokenizer(object):
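_get_node's cache check is a two-step comparison: node.hash == h rejects most candidates on a cheap integer test, and node.code == code then rules out hash collisions; a hit is also removed from the candidate list, the documented side effect. The pattern, reduced to a sketch with a stub parse callback (the real method additionally fixes line offsets and rewires the node tree):

    class CachedNode(object):
        def __init__(self, code):
            self.code = code
            self.hash = hash(code)

    def get_node(code, nodes, parse):
        h = hash(code)
        for node in list(nodes):
            # Cheap hash test first, full string equality only on a match.
            if node.hash == h and node.code == code:
                nodes.remove(node)  # side effect, as in _get_node
                return node
        return parse(code)

    cached = [CachedNode('def f():\n    pass\n')]
    node = get_node('def f():\n    pass\n', cached, parse=CachedNode)
    assert node.code.startswith('def f') and not cached  # reused and consumed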


@@ -609,9 +609,6 @@ class SubModule(Scope, Module):
         # this may be changed depending on fast_parser
         self.line_offset = 0
 
-        if 0:
-            self.use_as_parent = top_module or self
-
     def set_global_names(self, names):
         """
         Global means in these context a function (subscope) which has a global