Fix another very annoying fast parser issue.

This commit is contained in:
Dave Halter
2016-06-06 08:37:40 +02:00
parent dd85fc6ffd
commit 436f7dffe0
3 changed files with 56 additions and 31 deletions

View File

@@ -132,6 +132,9 @@ class ParserNode(object):
# We need to be able to reset the original children of a parser. # We need to be able to reset the original children of a parser.
self._old_children = list(self._content_scope.children) self._old_children = list(self._content_scope.children)
def is_root_node(self):
return self.parent is None
def _rewrite_last_newline(self): def _rewrite_last_newline(self):
""" """
The ENDMARKER can contain a newline in the prefix. However this prefix The ENDMARKER can contain a newline in the prefix. However this prefix
@@ -182,21 +185,25 @@ class ParserNode(object):
dcts.insert(0, self._content_scope.names_dict) dcts.insert(0, self._content_scope.names_dict)
self._content_scope.names_dict = MergedNamesDict(dcts) self._content_scope.names_dict = MergedNamesDict(dcts)
def parent_until_indent(self, indent=None):
if (indent is None or self._indent >= indent) and self.parent is not None:
self.close()
return self.parent.parent_until_indent(indent)
return self
@property @property
def _indent(self): def _indent(self):
if not self.parent: if self.is_root_node():
return 0 return 0
return self.parser.module.children[0].start_pos[1] return self.parser.module.children[0].start_pos[1]
def add_node(self, node, line_offset): def add_node(self, node, line_offset, indent):
"""Adding a node means adding a node that was already added earlier""" """
Adding a node means adding a node that was either just parsed or one
that can be reused.
"""
print(indent, self._indent)
#if indent > 0:
#import pdb;pdb.set_trace()
if self._indent >= indent and not self.is_root_node():
self.close()
return self.parent.add_node(node, line_offset, indent)
# Changing the line offsets is very important, because if they don't # Changing the line offsets is very important, because if they don't
# fit, all the start_pos values will be wrong. # fit, all the start_pos values will be wrong.
m = node.parser.module m = node.parser.module
@@ -245,11 +252,12 @@ class FastParser(use_metaclass(CachedFastParser)):
def _reset_caches(self): def _reset_caches(self):
self.module = FastModule(self.module_path) self.module = FastModule(self.module_path)
self.current_node = ParserNode(self.module, self, '') self.root_node = self.current_node = ParserNode(self.module, self, '')
def update(self, source): def update(self, source):
# For testing purposes: It is important that the number of parsers used # Variables for testing purposes: It is important that the number of
# can be minimized. With these variables we can test against that. # parsers used can be minimized. With these variables we can test
# against that.
self.number_parsers_used = 0 self.number_parsers_used = 0
self.number_of_splits = 0 self.number_of_splits = 0
self.number_of_misses = 0 self.number_of_misses = 0
@@ -349,8 +357,8 @@ class FastParser(use_metaclass(CachedFastParser)):
is_decorator = False is_decorator = False
parentheses_level = \ parentheses_level = \
max(0, (l.count('(') + l.count('[') + l.count('{') max(0, (l.count('(') + l.count('[') + l.count('{') -
- l.count(')') - l.count(']') - l.count('}'))) l.count(')') - l.count(']') - l.count('}')))
current_lines.append(l) current_lines.append(l)
@@ -372,9 +380,10 @@ class FastParser(use_metaclass(CachedFastParser)):
next_line_offset = line_offset = 0 next_line_offset = line_offset = 0
start = 0 start = 0
nodes = list(self.current_node.all_sub_nodes()) nodes = list(self.root_node.all_sub_nodes())
# Now we can reset the node, because we have all the old nodes. # Now we can reset the node, because we have all the old nodes.
self.current_node.reset_node() self.root_node.reset_node()
self.current_node = self.root_node
last_end_line = 1 last_end_line = 1
for code_part in self._split_parts(source): for code_part in self._split_parts(source):
@@ -383,8 +392,7 @@ class FastParser(use_metaclass(CachedFastParser)):
# we know that the parser went further (`def` start in a # we know that the parser went further (`def` start in a
# docstring). So just parse the next part. # docstring). So just parse the next part.
if line_offset + 1 == last_end_line: if line_offset + 1 == last_end_line:
self.current_node = self._get_node(code_part, source[start:], self._parse_part(code_part, source[start:], line_offset, nodes)
line_offset, nodes)
else: else:
# Means that some lines were not fully parsed. Parse it now. # Means that some lines were not fully parsed. Parse it now.
# This is a very rare case. Should only happen with very # This is a very rare case. Should only happen with very
@@ -397,8 +405,7 @@ class FastParser(use_metaclass(CachedFastParser)):
# complicated and error-prone. Since this is not very often # complicated and error-prone. Since this is not very often
# called - just ignore it. # called - just ignore it.
src = ''.join(self._lines[line_offset:]) src = ''.join(self._lines[line_offset:])
self.current_node = self._get_node(code_part, src, self._parse_part(code_part, src, line_offset, nodes)
line_offset, nodes)
last_end_line = self.current_node.parser.module.end_pos[0] last_end_line = self.current_node.parser.module.end_pos[0]
debug.dbg('While parsing %s, line %s slowed down the fast parser.', debug.dbg('While parsing %s, line %s slowed down the fast parser.',
@@ -413,14 +420,16 @@ class FastParser(use_metaclass(CachedFastParser)):
self.current_node.remove_last_newline() self.current_node.remove_last_newline()
# Now that the for loop is finished, we still want to close all nodes. # Now that the for loop is finished, we still want to close all nodes.
self.current_node = self.current_node.parent_until_indent() node = self.current_node
self.current_node.close() while node is not None:
node.close()
node = node.parent
debug.dbg('Parsed %s, with %s parsers in %s splits.' debug.dbg('Parsed %s, with %s parsers in %s splits.'
% (self.module_path, self.number_parsers_used, % (self.module_path, self.number_parsers_used,
self.number_of_splits)) self.number_of_splits))
def _get_node(self, source, parser_code, line_offset, nodes): def _parse_part(self, source, parser_code, line_offset, nodes):
""" """
Side effect: Alters the list of nodes. Side effect: Alters the list of nodes.
""" """
@@ -443,10 +452,9 @@ class FastParser(use_metaclass(CachedFastParser)):
node = ParserNode(self.module, p, code_part_actually_used) node = ParserNode(self.module, p, code_part_actually_used)
indent = len(parser_code) - len(parser_code.lstrip('\t ')) indent = len(parser_code) - len(parser_code.lstrip('\t '))
self.current_node = self.current_node.parent_until_indent(indent)
self.current_node.add_node(node, line_offset) self.current_node.add_node(node, line_offset, indent)
return node self.current_node = node
class FastTokenizer(object): class FastTokenizer(object):
@@ -506,19 +514,22 @@ class FastTokenizer(object):
self._closed = True self._closed = True
return current return current
if value in ('def', 'class') and self._parentheses_level \ previous_type = self.previous[0]
and re.search(r'\n[ \t]*\Z', prefix): if value in ('def', 'class') and self._parentheses_level:
# Account for the fact that an open parentheses before a function # Account for the fact that an open parentheses before a function
# will reset the parentheses counter, but new lines before will # will reset the parentheses counter, but new lines before will
# still be ignored. So check the prefix. # still be ignored. So check the prefix.
# TODO what about flow parentheses counter resets in the tokenizer? # TODO what about flow parentheses counter resets in the tokenizer?
self._parentheses_level = 0 self._parentheses_level = 0
return self._close() # We need to simulate a newline before the indent, because the
# open parentheses ignored them.
previous_type = NEWLINE
print('x', typ, repr(value), self.previous)
# Parentheses ignore the indentation rules. The other three stand for # Parentheses ignore the indentation rules. The other three stand for
# new lines. # new lines.
if self.previous[0] in (NEWLINE, INDENT, DEDENT) \ if previous_type in (NEWLINE, INDENT, DEDENT) \
and not self._parentheses_level and typ not in (INDENT, DEDENT): and not self._parentheses_level and typ not in (INDENT, DEDENT):
if not self._in_flow: if not self._in_flow:
if value in FLOWS: if value in FLOWS:

View File

@@ -48,13 +48,16 @@ class TestCallSignatures(TestCase):
run(s3, None, column=5) run(s3, None, column=5)
run(s3, None) run(s3, None)
# more complicated def test_more_complicated(self):
run = self._run_simple
'''
s4 = 'abs(zip(), , set,' s4 = 'abs(zip(), , set,'
run(s4, None, column=3) run(s4, None, column=3)
run(s4, 'abs', 0, 4) run(s4, 'abs', 0, 4)
run(s4, 'zip', 0, 8) run(s4, 'zip', 0, 8)
run(s4, 'abs', 0, 9) run(s4, 'abs', 0, 9)
#run(s4, 'abs', 1, 10) #run(s4, 'abs', 1, 10)
'''
s5 = "sorted(1,\nif 2:\n def a():" s5 = "sorted(1,\nif 2:\n def a():"
run(s5, 'sorted', 0, 7) run(s5, 'sorted', 0, 7)

View File

@@ -423,6 +423,17 @@ def test_fake_parentheses():
check_fp(src, 3, 2, 1) check_fp(src, 3, 2, 1)
def test_additional_indent():
source = dedent('''\
int(
def x():
pass
''')
x = check_fp(source, 2)
import pdb; pdb.set_trace()
def test_incomplete_function(): def test_incomplete_function():
source = '''return ImportErr''' source = '''return ImportErr'''