
Merge pull request #430 from hattya/dev

improve fast parser for incremental parsing
Dave Halter
2014-07-17 14:24:17 +04:30


@@ -183,6 +183,9 @@ class ParserNode(object):
 
 
 class FastParser(use_metaclass(CachedFastParser)):
+
+    _keyword_re = re.compile('^[ \t]*(def|class|@|%s)' % '|'.join(tokenize.FLOWS))
+
     def __init__(self, code, module_path=None):
        # set values like `pr.Module`.
         self.module_path = module_path
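
The hunk above hoists the keyword pattern out of `_split_parts` and compiles it once as a class attribute, so the regex is no longer rebuilt on every call. A minimal sketch of the pattern (hypothetical `Splitter` class, not jedi's code):

    import re

    class Splitter:
        # Compiled once, at class definition time, and shared by all instances.
        _keyword_re = re.compile(r'^[ \t]*(def|class|@)')

        def is_block_start(self, line):
            return self._keyword_re.match(line) is not None

    print(Splitter().is_block_start('    def f():'))  # True
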
@@ -196,7 +199,7 @@ class FastParser(use_metaclass(CachedFastParser)):
             self._parse(code)
         except:
             # FastParser is cached, be careful with exceptions
-            self.parsers[:] = []
+            del self.parsers[:]
             raise
 
     def update(self, code):
@@ -206,7 +209,7 @@ class FastParser(use_metaclass(CachedFastParser)):
             self._parse(code)
         except:
             # FastParser is cached, be careful with exceptions
-            self.parsers[:] = []
+            del self.parsers[:]
             raise
 
     def _split_parts(self, code):
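
Both hunks above swap `self.parsers[:] = []` for `del self.parsers[:]`. The two statements are equivalent: each empties the list in place rather than rebinding the name, which matters here because `FastParser` instances are cached and any other reference to the same list must see the reset. A small sketch of the distinction (toy names, not jedi's):

    cache = {'parsers': [1, 2, 3]}
    alias = cache['parsers']   # second reference to the same list

    del alias[:]               # clears in place, like del self.parsers[:]
    print(cache['parsers'])    # [] -- the cached reference sees the change

    alias = []                 # rebinding would leave cache['parsers'] untouched
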
@@ -215,34 +218,26 @@ class FastParser(use_metaclass(CachedFastParser)):
         each part seperately and therefore cache parts of the file and not
         everything.
         """
 
-        def add_part():
-            txt = '\n'.join(current_lines)
-            if txt:
-                if add_to_last and parts:
-                    parts[-1] += '\n' + txt
-                else:
-                    parts.append(txt)
-            current_lines[:] = []
-
-        r_keyword = '^[ \t]*(def|class|@|%s)' % '|'.join(tokenize.FLOWS)
+        def gen_part():
+            text = '\n'.join(current_lines)
+            del current_lines[:]
+            return text
 
         # Split only new lines. Distinction between \r\n is the tokenizer's
         # job.
         self._lines = code.split('\n')
         current_lines = []
-        parts = []
         is_decorator = False
         current_indent = 0
         old_indent = 0
         new_indent = False
         in_flow = False
-        add_to_last = False
         # All things within flows are simply being ignored.
-        for i, l in enumerate(self._lines):
+        for l in self._lines:
             # check for dedents
-            m = re.match('^([\t ]*)(.?)', l)
-            indent = len(m.group(1))
-            if m.group(2) in ['', '#']:
+            s = l.lstrip('\t ')
+            indent = len(l) - len(s)
+            if not s or s[0] in ('#', '\r'):
                 current_lines.append(l)  # just ignore comments and blank lines
                 continue
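
The hunk above replaces a per-line `re.match('^([\t ]*)(.?)', l)` with plain string operations: stripping leading tabs and spaces gives the indentation width as a length difference, and the first remaining character classifies blank lines, comments, and a stray `\r` left over from `\r\n` endings. A quick sketch of the same computation (standalone, hypothetical helper name):

    def classify(line):
        s = line.lstrip('\t ')
        indent = len(line) - len(s)
        skip = not s or s[0] in ('#', '\r')   # blank line, comment, or bare \r
        return indent, skip

    print(classify('    # a comment'))  # (4, True)
    print(classify('\tdef f():'))       # (1, False)
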
@@ -250,8 +245,8 @@ class FastParser(use_metaclass(CachedFastParser)):
                 current_indent = indent
                 new_indent = False
                 if not in_flow or indent < old_indent:
-                    add_part()
-                    add_to_last = False
+                    if current_lines:
+                        yield gen_part()
                     in_flow = False
             elif new_indent:
                 current_indent = indent
@@ -259,12 +254,12 @@ class FastParser(use_metaclass(CachedFastParser)):
 
             # Check lines for functions/classes and split the code there.
             if not in_flow:
-                m = re.match(r_keyword, l)
+                m = self._keyword_re.match(l)
                 if m:
                     in_flow = m.group(1) in tokenize.FLOWS
                     if not is_decorator and not in_flow:
-                        add_part()
-                        add_to_last = False
+                        if current_lines:
+                            yield gen_part()
                     is_decorator = '@' == m.group(1)
                     if not is_decorator:
                         old_indent = current_indent
@@ -272,12 +267,15 @@ class FastParser(use_metaclass(CachedFastParser)):
                         new_indent = True
             elif is_decorator:
                 is_decorator = False
-                add_to_last = True
 
             current_lines.append(l)
-        add_part()
-
-        return parts
+        if current_lines:
+            # skip newline at end of code,
+            # since it is not counted by Parser
+            if not current_lines[-1]:
+                del current_lines[-1]
+            if current_lines:
+                yield gen_part()
 
     def _parse(self, code):
         """ :type code: str """
@@ -285,24 +283,20 @@ class FastParser(use_metaclass(CachedFastParser)):
             new, temp = self._get_parser(unicode(''), unicode(''), 0, [], False)
             return new
 
-        parts = self._split_parts(code)
-        self.parsers[:] = []
-
+        del self.parsers[:]
         line_offset = 0
         start = 0
         p = None
         is_first = True
-
-        for code_part in parts:
-            lines = code_part.count('\n') + 1
+        for code_part in self._split_parts(code):
             if is_first or line_offset >= p.module.end_pos[0]:
-                indent = len(re.match(r'[ \t]*', code_part).group(0))
+                indent = len(code_part) - len(code_part.lstrip('\t '))
 
                 if is_first and self.current_node is not None:
                     nodes = [self.current_node]
                 else:
                     nodes = []
                 if self.current_node is not None:
                     self.current_node = \
                         self.current_node.parent_until_indent(indent)
                     nodes += self.current_node.old_children
@@ -347,7 +341,7 @@ class FastParser(use_metaclass(CachedFastParser)):
             #else:
                 #print '#'*45, line_offset, p.module.end_pos, 'theheck\n', repr(code_part)
 
-            line_offset += lines
+            line_offset += code_part.count('\n') + 1
             start += len(code_part) + 1  # +1 for newline
 
         if self.parsers:
@@ -358,29 +352,26 @@ class FastParser(use_metaclass(CachedFastParser)):
             self.module.end_pos = self.parsers[-1].module.end_pos
 
         # print(self.parsers[0].module.get_code())
-        del code
 
     def _get_parser(self, code, parser_code, line_offset, nodes, no_docstr):
         h = hash(code)
 
-        hashes = [n.hash for n in nodes]
-        node = None
-        try:
-            index = hashes.index(h)
-            if nodes[index].code != code:
-                raise ValueError()
-        except ValueError:
+        for index, node in enumerate(nodes):
+            if node.hash != h or node.code != code:
+                continue
+
+            if node != self.current_node:
+                offset = int(nodes[0] == self.current_node)
+                self.current_node.old_children.pop(index - offset)
+            p = node.parser
+            m = p.module
+            m.line_offset += line_offset + 1 - m.start_pos[0]
+            break
+        else:
             tokenizer = FastTokenizer(parser_code, line_offset)
             p = Parser(parser_code, self.module_path, tokenizer=tokenizer,
                        top_module=self.module, no_docstr=no_docstr)
             p.module.parent = self.module
-        else:
-            if nodes[index] != self.current_node:
-                offset = int(nodes[0] == self.current_node)
-                self.current_node.old_children.pop(index - offset)
-            node = nodes.pop(index)
-            p = node.parser
-            m = p.module
-            m.line_offset += line_offset + 1 - m.start_pos[0]
+            node = None
 
         return p, node
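
The final hunk rewrites the cached-node lookup from `hashes.index(h)` guarded by `try/except ValueError` into a single `for ... else` loop: the `else` suite runs only when the loop finishes without `break`, i.e. when no cached node matched on both hash and code, in which case a fresh parser is built. A minimal sketch of the idiom (toy objects, not jedi's parser nodes):

    class Node:
        def __init__(self, code):
            self.code = code
            self.hash = hash(code)

    def lookup(nodes, code):
        h = hash(code)
        for node in nodes:
            if node.hash == h and node.code == code:
                result = node         # cache hit: reuse the node
                break
        else:
            result = None             # no break fired: cache miss, build anew
        return result

    nodes = [Node('a'), Node('b')]
    print(lookup(nodes, 'b') is nodes[1])  # True
    print(lookup(nodes, 'c'))              # None
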