on the way to a better fast_parser - improved a lot of the positioning stuff

2013-03-24 22:51:17 +04:30
parent df058b93c2
commit a99d9541bd
3 changed files with 101 additions and 40 deletions
--- a/jedi/common.py
+++ b/jedi/common.py
@@ -6,6 +6,8 @@ from _compatibility import next
 import debug
 import settings

+FLOWS = ['if', 'else', 'elif', 'while', 'with', 'try', 'except', 'finally']
+

 class MultiLevelStopIteration(Exception):
    """
@@ -56,14 +58,21 @@ class PushBackIterator(object):


 class NoErrorTokenizer(object):
-    def __init__(self, readline, offset=(0, 0), stop_on_scope=False):
+    def __init__(self, readline, offset=(0, 0), is_fast_parser=False):
        self.readline = readline
        self.gen = PushBackIterator(tokenize.generate_tokens(readline))
        self.offset = offset
-        self.stop_on_scope = stop_on_scope
-        self.first_scope = False
        self.closed = False
-        self.first = True
+        self.is_first = True
+
+        # fast parser options
+        self.is_fast_parser = is_fast_parser
+        self.current = self.previous = [None, None, (0, 0), (0, 0), '']
+        self.in_flow = False
+        self.new_indent = False
+        self.parser_indent = 0
+        self.is_decorator = False
+        self.first_stmt = True

    def push_last_back(self):
        self.gen.push_back(self.current)
@@ -76,6 +85,8 @@ class NoErrorTokenizer(object):
        if self.closed:
            raise MultiLevelStopIteration()
        try:
+            last_previous = self.previous
+            self.previous = self.current
            self.current = next(self.gen)
        except tokenize.TokenError:
            # We just ignore this error, I try to handle it earlier - as
@@ -99,22 +110,60 @@ class NoErrorTokenizer(object):

        c = list(self.current)

-        # stop if a new class or definition is started at position zero.
-        breaks = ['def', 'class', '@']
-        if self.stop_on_scope and c[1] in breaks and c[2][1] == 0:
-            if self.first_scope:
-                self.closed = True
-                raise MultiLevelStopIteration()
-            elif c[1] != '@':
-                self.first_scope = True
+        if c[0] == tokenize.ENDMARKER:
+            self.current = self.previous
+            self.previous = last_previous
+            raise MultiLevelStopIteration()

-        if self.first:
+        # this is exactly the same check as in fast_parser, but this time with
+        # tokenize and therefore precise.
+        breaks = ['def', 'class', '@']
+
+        if self.is_first:
            c[2] = self.offset[0] + c[2][0], self.offset[1] + c[2][1]
            c[3] = self.offset[0] + c[3][0], self.offset[1] + c[3][1]
-            self.first = False
+            self.is_first = False
        else:
            c[2] = self.offset[0] + c[2][0], c[2][1]
            c[3] = self.offset[0] + c[3][0], c[3][1]
+        print 'h', c, tokenize.tok_name[c[0]], self.current[2:4]
+        self.current = c
+
+        def close():
+            if not self.first_stmt:
+                self.closed = True
+                raise MultiLevelStopIteration()
+        # ignore indents/comments
+        if self.is_fast_parser \
+                and self.previous[0] in (tokenize.INDENT, tokenize.NL, None,
+                                         tokenize.NEWLINE, tokenize.DEDENT) \
+                and c[0] not in (tokenize.COMMENT, tokenize.INDENT,
+                             tokenize.NL, tokenize.NEWLINE, tokenize.DEDENT):
+            print c, tokenize.tok_name[c[0]]
+
+            tok = c[1]
+            indent = c[2][1]
+            if indent < self.parser_indent:  # -> dedent
+                self.parser_indent = indent
+                self.new_indent = False
+                if not self.in_flow:
+                    close()
+                self.in_flow = False
+            elif self.new_indent:
+                self.parser_indent = indent
+
+            if not self.in_flow:
+                if tok in FLOWS or tok in breaks:
+                    self.in_flow = tok in FLOWS
+                    if not self.is_decorator and not self.in_flow:
+                        close()
+                    self.is_decorator = '@' == tok
+                    if not self.is_decorator:
+                        self.parser_indent += 1  # new scope: must be higher
+                        self.new_indent = True
+
+            if tok != '@':
+                self.first_stmt = False
        return c


--- a/jedi/fast_parser.py
+++ b/jedi/fast_parser.py
@@ -11,6 +11,7 @@ import settings
 import parsing
 import parsing_representation as pr
 import cache
+import common


 class Module(pr.Simple, pr.Module):
@@ -219,17 +220,15 @@ class FastParser(use_metaclass(CachedFastParser)):
                parts.append(txt)
                current_lines[:] = []

-        flows = ['if', 'else', 'elif', 'while', 'with', 'try', 'except',
-                 'finally']
-        r_keyword = '^[ \t]*(def|class|@|%s)' % '|'.join(flows)
+        r_keyword = '^[ \t]*(def|class|@|%s)' % '|'.join(common.FLOWS)

        lines = code.splitlines()
        current_lines = []
        parts = []
-        is_generator = False
+        is_decorator = False
        current_indent = 0
        new_indent = False
-        is_flow = False
+        in_flow = False
        # All things within flows are simply being ignored.
        for i, l in enumerate(lines):
            # check for dedents
@@ -242,29 +241,35 @@ class FastParser(use_metaclass(CachedFastParser)):
            if indent < current_indent:  # -> dedent
                current_indent = indent
                new_indent = False
-                if not is_flow:
+                if not in_flow:
                    add_part()
-                is_flow = False
+                in_flow = False
            elif new_indent:
                current_indent = indent
                new_indent = False

            # Check lines for functions/classes and split the code there.
-            if not is_flow:
+            if not in_flow:
                m = re.match(r_keyword, l)
                if m:
-                    is_flow = m.group(1) in flows
-                    if not is_generator and not is_flow:
+                    in_flow = m.group(1) in common.FLOWS
+                    if not is_decorator and not in_flow:
                        add_part()
                        current_lines = []
-                    is_generator = '@' == m.group(1)
-                    if not is_generator:
+                    is_decorator = '@' == m.group(1)
+                    if not is_decorator:
                        current_indent += 1  # it must be higher
                        new_indent = True

            current_lines.append(l)
        add_part()

+        for p in parts:
+            #print '#####################################'
+            #print p
+            #print len(p.splitlines())
+            pass
+
        return parts

    def _parse(self, code):
@@ -280,11 +285,12 @@ class FastParser(use_metaclass(CachedFastParser)):
                        el = module.imports[0]
                return el.start_pos[1]

-            if self.parsers:
+            if self.parsers and False:
                new_indent = get_indent(module)
                old_indent = get_indent(self.parsers[-1].module)
                if old_indent < new_indent:
-                    module.parent = self.parsers[-1].module.subscopes[0]
+                    #module.parent = self.parsers[-1].module.subscopes[0]
+                    # TODO set parents + add to subscopes
                    return
            p.module.parent = self.module

@@ -301,7 +307,7 @@ class FastParser(use_metaclass(CachedFastParser)):
        p = None
        parser_order = 0
        for code_part in parts:
-            lines = code_part.count('\n')
+            lines = code_part.count('\n') + 1
            # the parser is using additional newlines, therefore substract
            if p is None or line_offset >= p.end_pos[0] - 2:
                # check if code_part has already been parsed
@@ -336,8 +342,13 @@ class FastParser(use_metaclass(CachedFastParser)):

                parser_order += 1
            line_offset += lines
-            start += len(code_part)
+            print line_offset
+            start += len(code_part) + 1  # +1 for newline
        self.parsers[parser_order + 1:] = []
+        for p in self.parsers:
+            print(p.module.get_code())
+            print(p.module.start_pos, p.module.end_pos)
+        exit()

    def reset_caches(self):
        self._user_scope = None
--- a/jedi/parsing.py
+++ b/jedi/parsing.py
@@ -52,13 +52,11 @@ class Parser(object):
        self.user_stmt = None
        self.no_docstr = no_docstr

+        self.start_pos = self.end_pos = 1 + offset[0], offset[1]
        # initialize global Scope
-        self.module = pr.SubModule(module_path, (offset[0] + 1, offset[1]),
-                                                            top_module)
+        self.module = pr.SubModule(module_path, self.start_pos, top_module)
        self.scope = self.module
        self.current = (None, None)
-        self.start_pos = 1, 0
-        self.end_pos = 1, 0

        source = source + '\n'  # end with \n, because the parser needs it
        buf = StringIO(source)
@@ -79,6 +77,10 @@ class Parser(object):
            # because of `self.module.used_names`.
            d.parent = self.module

+        if self.current[0] in (tokenize.NL, tokenize.NEWLINE):
+            # we added a newline before, so we need to "remove" it again.
+            self.end_pos = self._gen.previous[2]
+
        self.start_pos = self.module.start_pos
        self.module.end_pos = self.end_pos
        del self._gen
@@ -170,8 +172,6 @@ class Parser(object):
        while True:
            defunct = False
            token_type, tok = self.next()
-            if token_type == tokenize.ENDMARKER:
-                break
            if brackets and tok == '\n':
                self.next()
            if tok == '(':  # python allows only one `(` in the statement.
@@ -421,8 +421,10 @@ class Parser(object):
    def __next__(self):
        """ Generate the next tokenize pattern. """
        try:
-            typ, tok, self.start_pos, self.end_pos, \
-                                self.parserline = next(self._gen)
+            typ, tok, start_pos, end_pos, self.parserline = next(self._gen)
+            # dedents shouldn't change positions
+            if typ != tokenize.DEDENT:
+                self.start_pos, self.end_pos = start_pos, end_pos
        except (StopIteration, common.MultiLevelStopIteration):
            # on finish, set end_pos correctly
            s = self.scope
@@ -662,7 +664,6 @@ class Parser(object):
                self.freshscope = False
            else:
                if token_type not in [tokenize.COMMENT, tokenize.INDENT,
-                                      tokenize.NEWLINE, tokenize.NL,
-                                      tokenize.ENDMARKER]:
+                                      tokenize.NEWLINE, tokenize.NL]:
                    debug.warning('token not classified', tok, token_type,
                                                        self.start_pos[0])