From a99d9541bd727b688ccd550f3731ec351f4f03d9 Mon Sep 17 00:00:00 2001
From: David Halter
Date: Sun, 24 Mar 2013 22:51:17 +0430
Subject: [PATCH] on the way to a better fast_parser - improved a lot of the positioning stuff

---
 jedi/common.py      | 77 ++++++++++++++++++++++++++++++++++++---------
 jedi/fast_parser.py | 43 +++++++++++++++----------
 jedi/parsing.py     | 21 +++++++------
 3 files changed, 101 insertions(+), 40 deletions(-)

diff --git a/jedi/common.py b/jedi/common.py
index 35a10f62..cfa374d4 100644
--- a/jedi/common.py
+++ b/jedi/common.py
@@ -6,6 +6,8 @@ from _compatibility import next
 import debug
 import settings
 
+FLOWS = ['if', 'else', 'elif', 'while', 'with', 'try', 'except', 'finally']
+
 
 class MultiLevelStopIteration(Exception):
     """
@@ -56,14 +58,21 @@ class PushBackIterator(object):
 
 
 class NoErrorTokenizer(object):
-    def __init__(self, readline, offset=(0, 0), stop_on_scope=False):
+    def __init__(self, readline, offset=(0, 0), is_fast_parser=False):
         self.readline = readline
         self.gen = PushBackIterator(tokenize.generate_tokens(readline))
         self.offset = offset
-        self.stop_on_scope = stop_on_scope
-        self.first_scope = False
         self.closed = False
-        self.first = True
+        self.is_first = True
+
+        # fast parser options
+        self.is_fast_parser = is_fast_parser
+        self.current = self.previous = [None, None, (0, 0), (0, 0), '']
+        self.in_flow = False
+        self.new_indent = False
+        self.parser_indent = 0
+        self.is_decorator = False
+        self.first_stmt = True
 
     def push_last_back(self):
         self.gen.push_back(self.current)
@@ -76,6 +85,8 @@ class NoErrorTokenizer(object):
         if self.closed:
             raise MultiLevelStopIteration()
         try:
+            last_previous = self.previous
+            self.previous = self.current
             self.current = next(self.gen)
         except tokenize.TokenError:
             # We just ignore this error, I try to handle it earlier - as
@@ -99,22 +110,60 @@ class NoErrorTokenizer(object):
 
         c = list(self.current)
 
-        # stop if a new class or definition is started at position zero.
-        breaks = ['def', 'class', '@']
-        if self.stop_on_scope and c[1] in breaks and c[2][1] == 0:
-            if self.first_scope:
-                self.closed = True
-                raise MultiLevelStopIteration()
-            elif c[1] != '@':
-                self.first_scope = True
+        if c[0] == tokenize.ENDMARKER:
+            self.current = self.previous
+            self.previous = last_previous
+            raise MultiLevelStopIteration()
 
-        if self.first:
+        # this is exactly the same check as in fast_parser, but this time
+        # with tokenize and therefore precise.
+        breaks = ['def', 'class', '@']
+
+        if self.is_first:
             c[2] = self.offset[0] + c[2][0], self.offset[1] + c[2][1]
             c[3] = self.offset[0] + c[3][0], self.offset[1] + c[3][1]
-            self.first = False
+            self.is_first = False
         else:
             c[2] = self.offset[0] + c[2][0], c[2][1]
             c[3] = self.offset[0] + c[3][0], c[3][1]
+        print 'h', c, tokenize.tok_name[c[0]], self.current[2:4]
+        self.current = c
+
+        def close():
+            if not self.first_stmt:
+                self.closed = True
+                raise MultiLevelStopIteration()
+
+        # ignore indents/comments
+        if self.is_fast_parser \
+                and self.previous[0] in (tokenize.INDENT, tokenize.NL, None,
+                                         tokenize.NEWLINE, tokenize.DEDENT) \
+                and c[0] not in (tokenize.COMMENT, tokenize.INDENT,
+                                 tokenize.NL, tokenize.NEWLINE, tokenize.DEDENT):
+            print c, tokenize.tok_name[c[0]]
+
+            tok = c[1]
+            indent = c[2][1]
+            if indent < self.parser_indent:  # -> dedent
+                self.parser_indent = indent
+                self.new_indent = False
+                if not self.in_flow:
+                    close()
+                self.in_flow = False
+            elif self.new_indent:
+                self.parser_indent = indent
+
+            if not self.in_flow:
+                if tok in FLOWS or tok in breaks:
+                    self.in_flow = tok in FLOWS
+                    if not self.is_decorator and not self.in_flow:
+                        close()
+                    self.is_decorator = '@' == tok
+                    if not self.is_decorator:
+                        self.parser_indent += 1  # new scope: must be higher
+                        self.new_indent = True
+
+            if tok != '@':
+                self.first_stmt = False
 
         return c
diff --git a/jedi/fast_parser.py b/jedi/fast_parser.py
index e3b74c44..8811d290 100644
--- a/jedi/fast_parser.py
+++ b/jedi/fast_parser.py
@@ -11,6 +11,7 @@ import settings
 import parsing
 import parsing_representation as pr
 import cache
+import common
 
 
 class Module(pr.Simple, pr.Module):
@@ -219,17 +220,15 @@
                 parts.append(txt)
                 current_lines[:] = []
 
-        flows = ['if', 'else', 'elif', 'while', 'with', 'try', 'except',
-                 'finally']
-        r_keyword = '^[ \t]*(def|class|@|%s)' % '|'.join(flows)
+        r_keyword = '^[ \t]*(def|class|@|%s)' % '|'.join(common.FLOWS)
 
         lines = code.splitlines()
         current_lines = []
         parts = []
-        is_generator = False
+        is_decorator = False
         current_indent = 0
        new_indent = False
-        is_flow = False
+        in_flow = False
         # All things within flows are simply being ignored.
         for i, l in enumerate(lines):
             # check for dedents
@@ -242,29 +241,35 @@
             if indent < current_indent:  # -> dedent
                 current_indent = indent
                 new_indent = False
-                if not is_flow:
+                if not in_flow:
                     add_part()
-                is_flow = False
+                in_flow = False
             elif new_indent:
                 current_indent = indent
                 new_indent = False
 
             # Check lines for functions/classes and split the code there.
-            if not is_flow:
+            if not in_flow:
                 m = re.match(r_keyword, l)
                 if m:
-                    is_flow = m.group(1) in flows
+                    in_flow = m.group(1) in common.FLOWS
-                    if not is_generator and not is_flow:
+                    if not is_decorator and not in_flow:
                         add_part()
                         current_lines = []
-                    is_generator = '@' == m.group(1)
+                    is_decorator = '@' == m.group(1)
-                    if not is_generator:
+                    if not is_decorator:
                         current_indent += 1  # it must be higher
                         new_indent = True
 
             current_lines.append(l)
 
         add_part()
+        for p in parts:
+            #print '#####################################'
+            #print p
+            #print len(p.splitlines())
+            pass
+
         return parts
 
     def _parse(self, code):
@@ -280,11 +285,12 @@
                 el = module.imports[0]
             return el.start_pos[1]
 
-        if self.parsers:
+        if self.parsers and False:
             new_indent = get_indent(module)
             old_indent = get_indent(self.parsers[-1].module)
             if old_indent < new_indent:
-                module.parent = self.parsers[-1].module.subscopes[0]
+                #module.parent = self.parsers[-1].module.subscopes[0]
+                # TODO set parents + add to subscopes
                 return
 
         p.module.parent = self.module
@@ -301,7 +307,7 @@
         p = None
         parser_order = 0
         for code_part in parts:
-            lines = code_part.count('\n')
+            lines = code_part.count('\n') + 1
             # the parser is using additional newlines, therefore subtract
             if p is None or line_offset >= p.end_pos[0] - 2:
                 # check if code_part has already been parsed
@@ -336,8 +342,13 @@
 
             parser_order += 1
             line_offset += lines
-            start += len(code_part)
+            print line_offset
+            start += len(code_part) + 1  # +1 for newline
         self.parsers[parser_order + 1:] = []
+        for p in self.parsers:
+            print(p.module.get_code())
+            print(p.module.start_pos, p.module.end_pos)
+        exit()
 
     def reset_caches(self):
         self._user_scope = None
diff --git a/jedi/parsing.py b/jedi/parsing.py
index 06036193..0ce923be 100644
--- a/jedi/parsing.py
+++ b/jedi/parsing.py
@@ -52,13 +52,11 @@ class Parser(object):
         self.user_stmt = None
         self.no_docstr = no_docstr
 
+        self.start_pos = self.end_pos = 1 + offset[0], offset[1]
         # initialize global Scope
-        self.module = pr.SubModule(module_path, (offset[0] + 1, offset[1]),
-                                   top_module)
+        self.module = pr.SubModule(module_path, self.start_pos, top_module)
         self.scope = self.module
         self.current = (None, None)
-        self.start_pos = 1, 0
-        self.end_pos = 1, 0
 
         source = source + '\n'  # end with \n, because the parser needs it
         buf = StringIO(source)
@@ -79,6 +77,10 @@
             # because of `self.module.used_names`.
             d.parent = self.module
 
+        if self.current[0] in (tokenize.NL, tokenize.NEWLINE):
+            # we added a newline before, so we need to "remove" it again.
+            self.end_pos = self._gen.previous[2]
+        self.start_pos = self.module.start_pos
         self.module.end_pos = self.end_pos
 
         del self._gen
@@ -170,8 +172,6 @@
         while True:
             defunct = False
             token_type, tok = self.next()
-            if token_type == tokenize.ENDMARKER:
-                break
             if brackets and tok == '\n':
                 self.next()
             if tok == '(':  # python allows only one `(` in the statement.
@@ -421,8 +421,10 @@
     def __next__(self):
         """ Generate the next tokenize pattern. """
""" try: - typ, tok, self.start_pos, self.end_pos, \ - self.parserline = next(self._gen) + typ, tok, start_pos, end_pos, self.parserline = next(self._gen) + # dedents shouldn't change positions + if typ != tokenize.DEDENT: + self.start_pos, self.end_pos = start_pos, end_pos except (StopIteration, common.MultiLevelStopIteration): # on finish, set end_pos correctly s = self.scope @@ -662,7 +664,6 @@ class Parser(object): self.freshscope = False else: if token_type not in [tokenize.COMMENT, tokenize.INDENT, - tokenize.NEWLINE, tokenize.NL, - tokenize.ENDMARKER]: + tokenize.NEWLINE, tokenize.NL]: debug.warning('token not classified', tok, token_type, self.start_pos[0])