Playing with the fast parser implementation.

This commit is contained in:
Dave Halter
2015-01-16 15:23:49 +01:00
parent cc7483498c
commit e477fab856
3 changed files with 83 additions and 66 deletions

View File

@@ -4,6 +4,7 @@ anything changes, it only reparses the changed parts. But because it's not
finished (and still not working as I want), I won't document it any further. finished (and still not working as I want), I won't document it any further.
""" """
import re import re
from itertools import chain
from jedi._compatibility import use_metaclass, unicode from jedi._compatibility import use_metaclass, unicode
from jedi import settings from jedi import settings
@@ -15,15 +16,14 @@ from jedi import cache
from jedi.parser.tokenize import source_tokens, FLOWS, NEWLINE, COMMENT, ENDMARKER from jedi.parser.tokenize import source_tokens, FLOWS, NEWLINE, COMMENT, ENDMARKER
class Module(pr.Module, pr.Simple): class FastModule(pr.Module, pr.Simple):
type = 'file_input'
def __init__(self, parsers): def __init__(self, parsers):
super(Module, self).__init__(self, (1, 0)) super(FastModule, self).__init__([])
self.parsers = parsers self.parsers = parsers
self.reset_caches() self.reset_caches()
self.start_pos = 1, 0
self.end_pos = None, None
def reset_caches(self): def reset_caches(self):
""" This module does a whole lot of caching, because it uses different """ This module does a whole lot of caching, because it uses different
parsers. """ parsers. """
@@ -69,45 +69,53 @@ class CachedFastParser(type):
class ParserNode(object): class ParserNode(object):
def __init__(self, parser, code, parent=None): def __init__(self, fast_module, parser, code, parent=None):
self._fast_module = fast_module
self.parent = parent self.parent = parent
self.children = [] self.parser_children = []
# must be created before new things are added to it. # must be created before new things are added to it.
self.save_contents(parser, code) self.save_contents(parser, code)
def save_contents(self, parser, code): def save_contents(self, parser, code):
print('SAVE')
self.code = code self.code = code
self.hash = hash(code) self.hash = hash(code)
self.parser = parser self.parser = parser
try: try:
# with fast_parser we have either 1 subscope or only statements. # With fast_parser we have either 1 subscope or only statements.
self.content_scope = parser.module.subscopes[0] self.content_scope = parser.module.subscopes[0]
except IndexError: except IndexError:
self.content_scope = parser.module self.content_scope = self._fast_module
"""
scope = self.content_scope scope = self.content_scope
self._contents = {} self._contents = {}
for c in pr.SCOPE_CONTENTS: for c in pr.SCOPE_CONTENTS:
self._contents[c] = list(getattr(scope, c)) self._contents[c] = list(getattr(scope, c))
self._is_generator = scope.is_generator self._is_generator = scope.is_generator
"""
self.old_children = self.children self.old_children = self.parser_children
self.children = [] self.parser_children = []
def reset_contents(self): def reset_contents(self):
"""
scope = self.content_scope scope = self.content_scope
for key, c in self._contents.items(): for key, c in self._contents.items():
setattr(scope, key, list(c)) setattr(scope, key, list(c))
scope.is_generator = self._is_generator scope.is_generator = self._is_generator
"""
"""
if self.parent is None: if self.parent is None:
# Global vars of the first one can be deleted, in the global scope # Global vars of the first one can be deleted, in the global scope
# they make no sense. # they make no sense.
self.parser.module.global_vars = [] self.parser.module.global_vars = []
"""
for c in self.children: for c in self.parser_children:
c.reset_contents() c.reset_contents()
def parent_until_indent(self, indent=None): def parent_until_indent(self, indent=None):
@@ -140,34 +148,35 @@ class ParserNode(object):
def _set_items(self, parser, set_parent=False): def _set_items(self, parser, set_parent=False):
# insert parser objects into current structure # insert parser objects into current structure
scope = self.content_scope scope = self.content_scope
for c in pr.SCOPE_CONTENTS: if set_parent:
content = getattr(scope, c) for child in parser.module.children:
items = getattr(parser.module, c) child.parent = scope
if set_parent: scope.children.append(child)
for i in items: print('\t\t', scope, child)
if i is None: """
continue # happens with empty returns if isinstance(i, (pr.Function, pr.Class)):
i.parent = scope.use_as_parent for d in i.decorators:
if isinstance(i, (pr.Function, pr.Class)): d.parent = scope
for d in i.decorators: """
d.parent = scope.use_as_parent # TODO global_vars ? is_generator ?
content += items """
# global_vars
cur = self cur = self
while cur.parent is not None: while cur.parent is not None:
cur = cur.parent cur = cur.parent
cur.parser.module.global_vars += parser.module.global_vars cur.parser.module.global_vars += parser.module.global_vars
scope.is_generator |= parser.module.is_generator scope.is_generator |= parser.module.is_generator
"""
def add_node(self, node, set_parent=False): def add_node(self, node, set_parent=False):
"""Adding a node means adding a node that was already added earlier""" """Adding a node means adding a node that was already added earlier"""
self.children.append(node) print('ADD')
self.parser_children.append(node)
self._set_items(node.parser, set_parent=set_parent) self._set_items(node.parser, set_parent=set_parent)
node.old_children = node.children # TODO potential memory leak? node.old_children = node.parser_children # TODO potential memory leak?
node.children = [] node.parser_children = []
"""
scope = self.content_scope scope = self.content_scope
while scope is not None: while scope is not None:
#print('x',scope) #print('x',scope)
@@ -175,10 +184,12 @@ class ParserNode(object):
# TODO This seems like a strange thing. Check again. # TODO This seems like a strange thing. Check again.
scope.end_pos = node.content_scope.end_pos scope.end_pos = node.content_scope.end_pos
scope = scope.parent scope = scope.parent
"""
return node return node
def add_parser(self, parser, code): def add_parser(self, parser, code):
return self.add_node(ParserNode(parser, code, self), True) print('add parser')
return self.add_node(ParserNode(self._fast_module, parser, code, self), True)
class FastParser(use_metaclass(CachedFastParser)): class FastParser(use_metaclass(CachedFastParser)):
@@ -189,10 +200,11 @@ class FastParser(use_metaclass(CachedFastParser)):
# set values like `pr.Module`. # set values like `pr.Module`.
self._grammar = grammar self._grammar = grammar
self.module_path = module_path self.module_path = module_path
print(module_path)
self.current_node = None self.current_node = None
self.parsers = [] self.parsers = []
self.module = Module(self.parsers) self.module = FastModule(self.parsers)
self.reset_caches() self.reset_caches()
try: try:
@@ -285,6 +297,7 @@ class FastParser(use_metaclass(CachedFastParser)):
p = None p = None
is_first = True is_first = True
for code_part in self._split_parts(code): for code_part in self._split_parts(code):
print(repr(code_part))
if is_first or line_offset >= p.module.end_pos[0]: if is_first or line_offset >= p.module.end_pos[0]:
indent = len(code_part) - len(code_part.lstrip('\t ')) indent = len(code_part) - len(code_part.lstrip('\t '))
if is_first and self.current_node is not None: if is_first and self.current_node is not None:
@@ -308,11 +321,12 @@ class FastParser(use_metaclass(CachedFastParser)):
code_part_actually_used = '\n'.join(used_lines) code_part_actually_used = '\n'.join(used_lines)
if is_first and p.module.subscopes: if is_first and p.module.subscopes:
print('NOXXXX')
# special case, we cannot use a function subscope as a # special case, we cannot use a function subscope as a
# base scope, subscopes would save all the other contents # base scope, subscopes would save all the other contents
new = empty_parser() new = empty_parser()
if self.current_node is None: if self.current_node is None:
self.current_node = ParserNode(new, '') self.current_node = ParserNode(self.module, new, '')
else: else:
self.current_node.save_contents(new, '') self.current_node.save_contents(new, '')
self.parsers.append(new) self.parsers.append(new)
@@ -320,7 +334,7 @@ class FastParser(use_metaclass(CachedFastParser)):
if is_first: if is_first:
if self.current_node is None: if self.current_node is None:
self.current_node = ParserNode(p, code_part_actually_used) self.current_node = ParserNode(self.module, p, code_part_actually_used)
else: else:
self.current_node.save_contents(p, code_part_actually_used) self.current_node.save_contents(p, code_part_actually_used)
else: else:
@@ -344,28 +358,28 @@ class FastParser(use_metaclass(CachedFastParser)):
else: else:
self.parsers.append(empty_parser()) self.parsers.append(empty_parser())
""" TODO used?
self.module.end_pos = self.parsers[-1].module.end_pos self.module.end_pos = self.parsers[-1].module.end_pos
"""
# print(self.parsers[0].module.get_code()) # print(self.parsers[0].module.get_code())
def _get_parser(self, code, parser_code, line_offset, nodes, no_docstr): def _get_parser(self, code, parser_code, line_offset, nodes, no_docstr):
h = hash(code) h = hash(code)
for index, node in enumerate(nodes): for index, node in enumerate(nodes):
if node.hash != h or node.code != code: if node.hash == h and node.code == code:
continue if node != self.current_node:
offset = int(nodes[0] == self.current_node)
if node != self.current_node: self.current_node.old_children.pop(index - offset)
offset = int(nodes[0] == self.current_node) p = node.parser
self.current_node.old_children.pop(index - offset) m = p.module
p = node.parser m.line_offset += line_offset + 1 - m.start_pos[0]
m = p.module break
m.line_offset += line_offset + 1 - m.start_pos[0]
break
else: else:
tokenizer = FastTokenizer(parser_code, line_offset) tokenizer = FastTokenizer(parser_code, line_offset)
p = Parser(parser_code, self.module_path, tokenizer=tokenizer, p = Parser(self._grammar, parser_code, self.module_path, tokenizer=tokenizer)
top_module=self.module, no_docstr=no_docstr) #p.module.parent = self.module # With the new parser this is not
p.module.parent = self.module # necessary anymore?
node = None node = None
return p, node return p, node
@@ -392,21 +406,27 @@ class FastTokenizer(object):
self.parser_indent = self.old_parser_indent = 0 self.parser_indent = self.old_parser_indent = 0
self.is_decorator = False self.is_decorator = False
self.first_stmt = True self.first_stmt = True
self._add_end_marker = False
self.parentheses_level = 0 self.parentheses_level = 0
def __iter__(self):
return self
def next(self): def next(self):
""" Python 2 Compatibility """ """ Python 2 Compatibility """
return self.__next__() return self.__next__()
def __next__(self): def __next__(self):
if self.closed: if self.closed:
raise common.MultiLevelStopIteration() if self._add_end_marker:
self._add_end_marker = False
start_pos = self.current[2]
return tokenize.ENDMARKER, '', start_pos, ''
raise StopIteration
current = next(self.gen) typ, value, start_pos, prefix = current = next(self.gen)
tok_type = current.type if typ == ENDMARKER:
tok_str = current.string self.closed = True
if tok_type == ENDMARKER:
raise common.MultiLevelStopIteration()
self.previous = self.current self.previous = self.current
self.current = current self.current = current
@@ -417,14 +437,14 @@ class FastTokenizer(object):
def close(): def close():
if not self.first_stmt: if not self.first_stmt:
self._add_end_marker = True
self.closed = True self.closed = True
raise common.MultiLevelStopIteration()
# Ignore comments/newlines, irrelevant for indentation. # Ignore comments/newlines, irrelevant for indentation.
if self.previous.type in (None, NEWLINE) \ if self.previous[0] in (None, NEWLINE) \
and tok_type not in (COMMENT, NEWLINE): and typ not in (COMMENT, NEWLINE):
# print c, tok_name[c[0]] # print c, tok_name[c[0]]
indent = current.start_pos[1] indent = start_pos[1]
if self.parentheses_level: if self.parentheses_level:
# parentheses ignore the indentation rules. # parentheses ignore the indentation rules.
pass pass
@@ -440,18 +460,18 @@ class FastTokenizer(object):
self.new_indent = False self.new_indent = False
if not self.in_flow: if not self.in_flow:
if tok_str in FLOWS or tok_str in breaks: if value in FLOWS or value in breaks:
self.in_flow = tok_str in FLOWS self.in_flow = value in FLOWS
if not self.is_decorator and not self.in_flow: if not self.is_decorator and not self.in_flow:
close() close()
self.is_decorator = '@' == tok_str self.is_decorator = '@' == value
if not self.is_decorator: if not self.is_decorator:
self.old_parser_indent = self.parser_indent self.old_parser_indent = self.parser_indent
self.parser_indent += 1 # new scope: must be higher self.parser_indent += 1 # new scope: must be higher
self.new_indent = True self.new_indent = True
if tok_str != '@': if value != '@':
if self.first_stmt and not self.new_indent: if self.first_stmt and not self.new_indent:
self.parser_indent = indent self.parser_indent = indent
self.first_stmt = False self.first_stmt = False
@@ -459,8 +479,8 @@ class FastTokenizer(object):
# Ignore closing parentheses, because they are all # Ignore closing parentheses, because they are all
# irrelevant for the indentation. # irrelevant for the indentation.
if tok_str in '([{': if value in '([{':
self.parentheses_level += 1 self.parentheses_level += 1
elif tok_str in ')]}': elif value in ')]}':
self.parentheses_level = max(self.parentheses_level - 1, 0) self.parentheses_level = max(self.parentheses_level - 1, 0)
return current return current

View File

@@ -39,9 +39,6 @@ from jedi._compatibility import (next, Python3Method, encoding, is_py3,
from jedi import cache from jedi import cache
SCOPE_CONTENTS = 'asserts', 'subscopes', 'imports', 'statements', 'returns'
def is_node(node, *symbol_names): def is_node(node, *symbol_names):
try: try:
type = node.type type = node.type

View File

@@ -136,7 +136,7 @@ On Linux, if environment variable ``$XDG_CACHE_HOME`` is set,
# parser # parser
# ---------------- # ----------------
fast_parser = False fast_parser = True
""" """
Use the fast parser. This means that reparsing is only being done if Use the fast parser. This means that reparsing is only being done if
something has been changed e.g. to a function. If this happens, only the something has been changed e.g. to a function. If this happens, only the