1
0
forked from VimPlug/jedi

Simplify the fast parser tokenizer more. Now it is more readable and less buggy (+bugfixes).

This commit is contained in:
Dave Halter
2015-01-31 20:09:44 +01:00
parent 1826f432c8
commit bc118e8047
5 changed files with 44 additions and 65 deletions

View File

@@ -177,6 +177,7 @@ class Instance(use_metaclass(CachedMetaClass, Executed)):
return names return names
def get_subscope_by_name(self, name): def get_subscope_by_name(self, name):
print(name)
sub = self.base.get_subscope_by_name(name) sub = self.base.get_subscope_by_name(name)
return get_instance_el(self._evaluator, self, sub, True) return get_instance_el(self._evaluator, self, sub, True)

View File

@@ -301,13 +301,13 @@ class Parser(object):
def _tokenize(self, tokenizer): def _tokenize(self, tokenizer):
for typ, value, start_pos, prefix in tokenizer: for typ, value, start_pos, prefix in tokenizer:
# print(token.tok_name[typ], repr(value), start_pos, repr(prefix))
if self._omit_dedent and typ == token.DEDENT: if self._omit_dedent and typ == token.DEDENT:
self._omit_dedent -= 1 self._omit_dedent -= 1
continue continue
if typ == token.OP: if typ == token.OP:
typ = token.opmap[value] typ = token.opmap[value]
print(token.tok_name[typ], repr(value), start_pos, repr(prefix))
yield typ, value, prefix, start_pos yield typ, value, prefix, start_pos
def __repr__(self): def __repr__(self):
@@ -322,7 +322,6 @@ class Parser(object):
endmarker = self.module.children[-1] endmarker = self.module.children[-1]
# The newline is either in the endmarker as a prefix or the previous # The newline is either in the endmarker as a prefix or the previous
# leaf as a newline token. # leaf as a newline token.
print('REMOVE', endmarker.start_pos)
if endmarker.prefix.endswith('\n'): if endmarker.prefix.endswith('\n'):
endmarker.prefix = endmarker.prefix[:-1] endmarker.prefix = endmarker.prefix[:-1]
last_line = re.sub('.*\n', '', endmarker.prefix) last_line = re.sub('.*\n', '', endmarker.prefix)

View File

@@ -12,6 +12,7 @@ from jedi.parser import Parser
from jedi.parser import tree as pr from jedi.parser import tree as pr
from jedi.parser import tokenize from jedi.parser import tokenize
from jedi import cache from jedi import cache
from jedi import debug
from jedi.parser.tokenize import (source_tokens, NEWLINE, from jedi.parser.tokenize import (source_tokens, NEWLINE,
ENDMARKER, INDENT, DEDENT) ENDMARKER, INDENT, DEDENT)
@@ -155,7 +156,6 @@ class ParserNode(object):
self._node_children = [] self._node_children = []
scope = self._content_scope scope = self._content_scope
scope.children = list(self._old_children) scope.children = list(self._old_children)
print('reset', scope.children)
try: try:
# This works if it's a MergedNamesDict. # This works if it's a MergedNamesDict.
# We are correcting it, because the MergedNamesDicts are artificial # We are correcting it, because the MergedNamesDicts are artificial
@@ -187,14 +187,11 @@ class ParserNode(object):
nodes should be added anymore. nodes should be added anymore.
""" """
print('CLOSE NODE', id(self), self.parent, self._node_children) print('CLOSE NODE', id(self), self.parent, self._node_children)
print(self.parser.module.names_dict, [p.parser.module.names_dict for p in
self._node_children])
# We only need to replace the dict if multiple dictionaries are used: # We only need to replace the dict if multiple dictionaries are used:
if self._node_children: if self._node_children:
dcts = [n.parser.module.names_dict for n in self._node_children] dcts = [n.parser.module.names_dict for n in self._node_children]
# Need to insert the own node as well. # Need to insert the own node as well.
dcts.insert(0, self._content_scope.names_dict) dcts.insert(0, self._content_scope.names_dict)
print('DCTS', self.parser, dcts, self._node_children)
self._content_scope.names_dict = MergedNamesDict(dcts) self._content_scope.names_dict = MergedNamesDict(dcts)
def parent_until_indent(self, indent=None): def parent_until_indent(self, indent=None):
@@ -411,19 +408,15 @@ class FastParser(use_metaclass(CachedFastParser)):
print('OFF', line_offset + 1, self.current_node.parser.module.end_pos) print('OFF', line_offset + 1, self.current_node.parser.module.end_pos)
#import pdb; pdb.set_trace() #import pdb; pdb.set_trace()
if is_first or line_offset + 1 == self.current_node.parser.module.end_pos[0]: if is_first or line_offset + 1 == self.current_node.parser.module.end_pos[0]:
print(repr(code_part))
indent = len(code_part) - len(code_part.lstrip('\t ')) indent = len(code_part) - len(code_part.lstrip('\t '))
self.current_node = self.current_node.parent_until_indent(indent) self.current_node = self.current_node.parent_until_indent(indent)
print('cur', id(self.current_node)) # print '#'*45,line_offset, p.module.end_pos, '\n', code_part
# check if code_part has already been parsed # check if code_part has already been parsed
# print '#'*45,line_offset, p and p.module.end_pos, '\n', code_part
self.current_node = self._get_node(code_part, source[start:], self.current_node = self._get_node(code_part, source[start:],
line_offset, nodes, not is_first) line_offset, nodes, not is_first)
if False and is_first and self.current_node.parser.module.subscopes: if False and is_first and self.current_node.parser.module.subscopes:
print('NOXXXX')
raise NotImplementedError raise NotImplementedError
# Special case, we cannot use a function subscope as a # Special case, we cannot use a function subscope as a
# base scope, subscopes would save all the other contents # base scope, subscopes would save all the other contents
@@ -474,6 +467,9 @@ class FastParser(use_metaclass(CachedFastParser)):
""" TODO used? """ TODO used?
self.module.end_pos = self.parsers[-1].module.end_pos self.module.end_pos = self.parsers[-1].module.end_pos
""" """
debug.dbg('Parsed %s, with %s parsers in %s splits.'
% (self.module_path, self.number_parsers_used,
self.number_of_splits))
# print(self.parsers[0].module.get_code()) # print(self.parsers[0].module.get_code())
@@ -483,7 +479,7 @@ class FastParser(use_metaclass(CachedFastParser)):
""" """
h = hash(source) h = hash(source)
for index, node in enumerate(nodes): for index, node in enumerate(nodes):
print('EQ', node, repr(node.source), repr(source)) #print('EQ', node, repr(node.source), repr(source))
if node.hash == h and node.source == source: if node.hash == h and node.source == source:
node.reset_node() node.reset_node()
nodes.remove(node) nodes.remove(node)
@@ -497,8 +493,7 @@ class FastParser(use_metaclass(CachedFastParser)):
node = ParserNode(self.module) node = ParserNode(self.module)
end = line_offset + p.module.end_pos[0] end = line_offset + p.module.end_pos[0]
print('\nACTUALLY PARSING', p.module.end_pos, repr(source), print('\nACTUALLY PARSING', p.module.end_pos, repr(source), len(self._lines), line_offset)
len(self._lines), line_offset)
if not (len(self._lines) == end): if not (len(self._lines) == end):
# We don't keep the last line, except if we're done. A newline # We don't keep the last line, except if we're done. A newline
# ends on the next line, which is part of the next parser. But # ends on the next line, which is part of the next parser. But
@@ -525,15 +520,12 @@ class FastTokenizer(object):
# fast parser options # fast parser options
self.current = self.previous = NEWLINE, '', (0, 0) self.current = self.previous = NEWLINE, '', (0, 0)
self._in_flow = False self._in_flow = False
self._new_indent = False
self._parser_indent = self._old_parser_indent = 0
self._is_decorator = False self._is_decorator = False
self._first_stmt = True self._first_stmt = True
self._parentheses_level = 0 self._parentheses_level = 0
self._indent_counter = 0 self._indent_counter = 0
self._flow_indent_counter = 0 self._flow_indent_counter = 0
self._returned_endmarker = False self._returned_endmarker = False
self._next_dedent_noclose = False
def __iter__(self): def __iter__(self):
return self return self
@@ -555,65 +547,35 @@ class FastTokenizer(object):
self.previous = self.current self.previous = self.current
self.current = current self.current = current
# this is exactly the same check as in fast_parser, but this time with
# tokenize and therefore precise.
breaks = ['def', 'class', '@']
if typ == INDENT: if typ == INDENT:
self._indent_counter += 1 self._indent_counter += 1
elif typ == DEDENT: elif typ == DEDENT:
self._indent_counter -= 1 self._indent_counter -= 1
print('DEDENT', self._flow_indent_counter, start_pos, self._indent_counter) print(self._in_flow, self._indent_counter, self._flow_indent_counter)
if self._in_flow and self._indent_counter == self._flow_indent_counter: if self._in_flow and self._indent_counter == self._flow_indent_counter:
self._in_flow = False self._in_flow = False
self._next_dedent_noclose = True elif not self._in_flow:
self._closed = True
return current return current
if self.previous[0] in (NEWLINE, INDENT, DEDENT): # Parentheses ignore the indentation rules. The other three stand for
if self.previous[0] == DEDENT: # new lines.
if not self._in_flow: if self.previous[0] in (NEWLINE, INDENT, DEDENT) \
if not self._next_dedent_noclose: and not self._parentheses_level:
self._first_stmt = False
return self._close()
self._next_dedent_noclose = False
# Check for NEWLINE, which symbolizes the indent. # Check for NEWLINE, which symbolizes the indent.
#print('X', repr(value), tokenize.tok_name[typ]) #print('X', repr(value), tokenize.tok_name[typ])
indent = start_pos[1]
#print(indent, self._parser_indent)
if self._parentheses_level:
# Parentheses ignore the indentation rules.
pass
elif False and indent < self._parser_indent: # -> dedent
raise NotImplementedError
self._parser_indent = indent
self._new_indent = False
print(self._in_flow, indent, self._old_parser_indent)
if not self._in_flow or indent < self._old_parser_indent:
return self._close()
self._in_flow = False
elif self._new_indent:
self._parser_indent = indent
self._new_indent = False
if not self._in_flow: if not self._in_flow:
self._in_flow = value in FLOWS self._in_flow = value in FLOWS
if self._in_flow: if self._in_flow:
print('INFLOW', self._indent_counter)
self._flow_indent_counter = self._indent_counter self._flow_indent_counter = self._indent_counter
#self._old_parser_indent = self._parser_indent elif value in ('def', 'class', '@'):
#self._parser_indent += 1 # new scope: must be higher # The values here are exactly the same check as in
#self._new_indent = True # _split_parts, but this time with tokenize and therefore
elif value in breaks: # precise.
if not self._is_decorator: if not self._is_decorator:
return self._close() return self._close()
self._is_decorator = '@' == value self._is_decorator = '@' == value
#if not self._is_decorator:
#self._old_parser_indent = self._parser_indent
#self._parser_indent += 1 # new scope: must be higher
#self._new_indent = True
if value in '([{' and value: if value in '([{' and value:
self._parentheses_level += 1 self._parentheses_level += 1
@@ -628,8 +590,6 @@ class FastTokenizer(object):
# Continue like nothing has happened, because we want to enter # Continue like nothing has happened, because we want to enter
# the first class/function. # the first class/function.
if self.current[1] != '@': if self.current[1] != '@':
#if self._first_stmt and not self._new_indent:
#self._parser_indent = indent
self._first_stmt = False self._first_stmt = False
return self.current return self.current
else: else:
@@ -637,18 +597,19 @@ class FastTokenizer(object):
return self._finish_dedents() return self._finish_dedents()
def _finish_dedents(self): def _finish_dedents(self):
start_pos = self.current[2]
print('FINISH', self._indent_counter)
if self._indent_counter: if self._indent_counter:
self._indent_counter -= 1 self._indent_counter -= 1
return tokenize.DEDENT, '', start_pos, '' return tokenize.DEDENT, '', self.current[2], ''
elif not self._returned_endmarker: elif not self._returned_endmarker:
self._returned_endmarker = True self._returned_endmarker = True
# We're using the current prefix for the endmarker to not lose any # We're using the current prefix for the endmarker to not lose any
# information. However we care about "lost" lines. The prefix of # information. However we care about "lost" lines. The prefix of
# the current line (indent) will always be included in the current # the current line (indent) will always be included in the current
# line. # line.
prefix = re.sub('[^\n]+$', '', self.current[3]) t, _, start_pos, prefix = next(self._gen)
# \Z for the end of the string. $ is not suitable here, because even
# without re.MULTILINE it also matches just before a trailing newline.
prefix = re.sub(r'[^\n]+\Z', '', prefix)
return ENDMARKER, '', start_pos, prefix return ENDMARKER, '', start_pos, prefix
else: else:
raise StopIteration raise StopIteration

View File

@@ -706,7 +706,7 @@ class Class(ClassOrFunc):
super(Class, self).__init__(children) super(Class, self).__init__(children)
def get_super_arglist(self): def get_super_arglist(self):
if len(self.children) == 4: # Has no parentheses if self.children[2] != '(': # Has no parentheses
return None return None
else: else:
if self.children[3] == ')': # Empty parentheses if self.children[3] == ')': # Empty parentheses

View File

@@ -180,6 +180,24 @@ def test_nested_funcs():
check_fp(src, 3) check_fp(src, 3)
def test_class_and_if():
src = dedent("""\
class V:
def __init__(self):
pass
if 1:
c = 3
def a_func():
return 1
# COMMENT
a_func()""")
check_fp(src, 5, 5)
assert [d.name for d in jedi.Script(src).goto_definitions()] == ['int']
def test_func_with_for_and_comment(): def test_func_with_for_and_comment():
# The first newline is important, leave it. It should not trigger another # The first newline is important, leave it. It should not trigger another
# parser split. # parser split.