Simplified the line splitting and, with it, a few other things in the fast parser.

2026-05-25 01:38:36 +08:00 · 2015-02-03 22:22:57 +01:00
parent 66dfa59286
commit e23e354fe8
2 changed files with 14 additions and 27 deletions
@@ -307,7 +307,7 @@ class Parser(object):
    def _tokenize(self, tokenizer):
        for typ, value, start_pos, prefix in tokenizer:
-            print(token.tok_name[typ], repr(value), start_pos, repr(prefix))
+            #print(token.tok_name[typ], repr(value), start_pos, repr(prefix))
            if self._omit_dedent and typ == token.DEDENT:
                self._omit_dedent -= 1
                continue
@@ -337,24 +337,21 @@ class FastParser(use_metaclass(CachedFastParser)):
        not everything.
        """
        def gen_part():
-            text = '\n'.join(current_lines)
+            text = ''.join(current_lines)
            del current_lines[:]
            self.number_of_splits += 1
-            if i == len(self._lines) - 1:
+            return text
-                return text
-            else:
-                return text + '\n'
        def just_newlines(current_lines):
            for line in current_lines:
-                line = line.lstrip('\t ')
+                line = line.lstrip('\t \n\r')
-                if line and line[0] not in ('#', '\r'):
+                if line and line[0] != '#':
                    return False
            return True
        # Split only new lines. Distinction between \r\n is the tokenizer's
        # job.
-        self._lines = source.split('\n')
+        self._lines = source.splitlines(keepends=True)
        current_lines = []
        is_decorator = False
        current_indent = 0
@@ -364,9 +361,9 @@ class FastParser(use_metaclass(CachedFastParser)):
        # All things within flows are simply being ignored.
        for i, l in enumerate(self._lines):
            # check for dedents
-            s = l.lstrip('\t ')
+            s = l.lstrip('\t \n\r')
            indent = len(l) - len(s)
-            if not s or s[0] in ('#', '\r'):
+            if not s or s[0] == '#':
                current_lines.append(l)  # just ignore comments and blank lines
                continue
@@ -388,7 +385,6 @@ class FastParser(use_metaclass(CachedFastParser)):
                    in_flow = m.group(1) in FLOWS
                    if not is_decorator and not in_flow:
                        if not just_newlines(current_lines):
-                            print('GEN', current_lines)
                            yield gen_part()
                    is_decorator = '@' == m.group(1)
                    if not is_decorator:
@@ -424,7 +420,7 @@ class FastParser(use_metaclass(CachedFastParser)):
        for code_part in self._split_parts(source):
            if not is_first:
-                print('OFF', line_offset, self.current_node.parser.module.end_pos)
+                #print('OFF', line_offset, self.current_node.parser.module.end_pos)
                #import pdb; pdb.set_trace()
                pass # TODO remove
            if is_first or line_offset + 1 == self.current_node.parser.module.end_pos[0]:
@@ -470,7 +466,6 @@ class FastParser(use_metaclass(CachedFastParser)):
            start += len(code_part)
        if added_newline:
-            print('REMOVE NL', self.current_node)
            self.current_node.remove_last_newline()
        # Now that the for loop is finished, we still want to close all nodes.
@@ -498,10 +493,9 @@ class FastParser(use_metaclass(CachedFastParser)):
        """
        Side effect: Alters the list of nodes.
        """
-        print('r', repr(source))
        h = hash(source)
        for index, node in enumerate(nodes):
-            print('EQ', node, repr(node.source), repr(source))
+            #print('EQ', node, repr(node.source), repr(source))
            if node.hash == h and node.source == source:
                node.reset_node()
                nodes.remove(node)
@@ -509,19 +503,13 @@ class FastParser(use_metaclass(CachedFastParser)):
        else:
            tokenizer = FastTokenizer(parser_code, 0)
            self.number_parsers_used += 1
-            print('CODE', repr(source))
+            #print('CODE', repr(source))
            p = Parser(self._grammar, parser_code, self.module_path, tokenizer=tokenizer)
            node = ParserNode(self.module)
            end = line_offset + p.module.end_pos[0]
-            if not (len(self._lines) == end):
+            used_lines = self._lines[line_offset:end - 1]
-                # We don't keep the last line, except if were done. A newline
+            code_part_actually_used = ''.join(used_lines)
-                # ends on the next line, which is part of the next parser. But
-                # the last parser includes the last new line.
-                end -= 1
-            print(line_offset, end)
-            used_lines = self._lines[line_offset:end]
-            code_part_actually_used = '\n'.join(used_lines)
            node.set_parser(p, code_part_actually_used)
        self.current_node.add_node(node, line_offset)
@@ -589,7 +577,7 @@ class FastTokenizer(object):
        if self.previous[0] in (NEWLINE, INDENT, DEDENT) \
                and not self._parentheses_level and typ != INDENT:
            # Check for NEWLINE, which symbolizes the indent.
-            print('X', repr(value), tokenize.tok_name[typ])
+           # print('X', repr(value), tokenize.tok_name[typ])
            if not self._in_flow:
                self._in_flow = value in FLOWS
                if self._in_flow:
@@ -606,7 +594,6 @@ class FastTokenizer(object):
                        self._first_stmt = False
                        self._expect_indent = True
                elif self._expect_indent:
-                    print('EXP', self._first_stmt)
                    return self._close()
                else:
                    self._first_stmt = False