Simplified the line splitting and, with that, a few other things in the fast parser.

This commit is contained in:
Dave Halter
2015-02-03 22:22:57 +01:00
parent 66dfa59286
commit e23e354fe8
2 changed files with 14 additions and 27 deletions
+1 -1
View File
@@ -307,7 +307,7 @@ class Parser(object):
def _tokenize(self, tokenizer): def _tokenize(self, tokenizer):
for typ, value, start_pos, prefix in tokenizer: for typ, value, start_pos, prefix in tokenizer:
print(token.tok_name[typ], repr(value), start_pos, repr(prefix)) #print(token.tok_name[typ], repr(value), start_pos, repr(prefix))
if self._omit_dedent and typ == token.DEDENT: if self._omit_dedent and typ == token.DEDENT:
self._omit_dedent -= 1 self._omit_dedent -= 1
continue continue
+13 -26
View File
@@ -337,24 +337,21 @@ class FastParser(use_metaclass(CachedFastParser)):
not everything. not everything.
""" """
def gen_part(): def gen_part():
text = '\n'.join(current_lines) text = ''.join(current_lines)
del current_lines[:] del current_lines[:]
self.number_of_splits += 1 self.number_of_splits += 1
if i == len(self._lines) - 1: return text
return text
else:
return text + '\n'
def just_newlines(current_lines): def just_newlines(current_lines):
for line in current_lines: for line in current_lines:
line = line.lstrip('\t ') line = line.lstrip('\t \n\r')
if line and line[0] not in ('#', '\r'): if line and line[0] != '#':
return False return False
return True return True
# Split only new lines. Distinction between \r\n is the tokenizer's # Split only new lines. Distinction between \r\n is the tokenizer's
# job. # job.
self._lines = source.split('\n') self._lines = source.splitlines(keepends=True)
current_lines = [] current_lines = []
is_decorator = False is_decorator = False
current_indent = 0 current_indent = 0
@@ -364,9 +361,9 @@ class FastParser(use_metaclass(CachedFastParser)):
# All things within flows are simply being ignored. # All things within flows are simply being ignored.
for i, l in enumerate(self._lines): for i, l in enumerate(self._lines):
# check for dedents # check for dedents
s = l.lstrip('\t ') s = l.lstrip('\t \n\r')
indent = len(l) - len(s) indent = len(l) - len(s)
if not s or s[0] in ('#', '\r'): if not s or s[0] == '#':
current_lines.append(l) # just ignore comments and blank lines current_lines.append(l) # just ignore comments and blank lines
continue continue
@@ -388,7 +385,6 @@ class FastParser(use_metaclass(CachedFastParser)):
in_flow = m.group(1) in FLOWS in_flow = m.group(1) in FLOWS
if not is_decorator and not in_flow: if not is_decorator and not in_flow:
if not just_newlines(current_lines): if not just_newlines(current_lines):
print('GEN', current_lines)
yield gen_part() yield gen_part()
is_decorator = '@' == m.group(1) is_decorator = '@' == m.group(1)
if not is_decorator: if not is_decorator:
@@ -424,7 +420,7 @@ class FastParser(use_metaclass(CachedFastParser)):
for code_part in self._split_parts(source): for code_part in self._split_parts(source):
if not is_first: if not is_first:
print('OFF', line_offset, self.current_node.parser.module.end_pos) #print('OFF', line_offset, self.current_node.parser.module.end_pos)
#import pdb; pdb.set_trace() #import pdb; pdb.set_trace()
pass # TODO remove pass # TODO remove
if is_first or line_offset + 1 == self.current_node.parser.module.end_pos[0]: if is_first or line_offset + 1 == self.current_node.parser.module.end_pos[0]:
@@ -470,7 +466,6 @@ class FastParser(use_metaclass(CachedFastParser)):
start += len(code_part) start += len(code_part)
if added_newline: if added_newline:
print('REMOVE NL', self.current_node)
self.current_node.remove_last_newline() self.current_node.remove_last_newline()
# Now that the for loop is finished, we still want to close all nodes. # Now that the for loop is finished, we still want to close all nodes.
@@ -498,10 +493,9 @@ class FastParser(use_metaclass(CachedFastParser)):
""" """
Side effect: Alters the list of nodes. Side effect: Alters the list of nodes.
""" """
print('r', repr(source))
h = hash(source) h = hash(source)
for index, node in enumerate(nodes): for index, node in enumerate(nodes):
print('EQ', node, repr(node.source), repr(source)) #print('EQ', node, repr(node.source), repr(source))
if node.hash == h and node.source == source: if node.hash == h and node.source == source:
node.reset_node() node.reset_node()
nodes.remove(node) nodes.remove(node)
@@ -509,19 +503,13 @@ class FastParser(use_metaclass(CachedFastParser)):
else: else:
tokenizer = FastTokenizer(parser_code, 0) tokenizer = FastTokenizer(parser_code, 0)
self.number_parsers_used += 1 self.number_parsers_used += 1
print('CODE', repr(source)) #print('CODE', repr(source))
p = Parser(self._grammar, parser_code, self.module_path, tokenizer=tokenizer) p = Parser(self._grammar, parser_code, self.module_path, tokenizer=tokenizer)
node = ParserNode(self.module) node = ParserNode(self.module)
end = line_offset + p.module.end_pos[0] end = line_offset + p.module.end_pos[0]
if not (len(self._lines) == end): used_lines = self._lines[line_offset:end - 1]
# We don't keep the last line, except if were done. A newline code_part_actually_used = ''.join(used_lines)
# ends on the next line, which is part of the next parser. But
# the last parser includes the last new line.
end -= 1
print(line_offset, end)
used_lines = self._lines[line_offset:end]
code_part_actually_used = '\n'.join(used_lines)
node.set_parser(p, code_part_actually_used) node.set_parser(p, code_part_actually_used)
self.current_node.add_node(node, line_offset) self.current_node.add_node(node, line_offset)
@@ -589,7 +577,7 @@ class FastTokenizer(object):
if self.previous[0] in (NEWLINE, INDENT, DEDENT) \ if self.previous[0] in (NEWLINE, INDENT, DEDENT) \
and not self._parentheses_level and typ != INDENT: and not self._parentheses_level and typ != INDENT:
# Check for NEWLINE, which symbolizes the indent. # Check for NEWLINE, which symbolizes the indent.
print('X', repr(value), tokenize.tok_name[typ]) # print('X', repr(value), tokenize.tok_name[typ])
if not self._in_flow: if not self._in_flow:
self._in_flow = value in FLOWS self._in_flow = value in FLOWS
if self._in_flow: if self._in_flow:
@@ -606,7 +594,6 @@ class FastTokenizer(object):
self._first_stmt = False self._first_stmt = False
self._expect_indent = True self._expect_indent = True
elif self._expect_indent: elif self._expect_indent:
print('EXP', self._first_stmt)
return self._close() return self._close()
else: else:
self._first_stmt = False self._first_stmt = False