From a99d9541bd727b688ccd550f3731ec351f4f03d9 Mon Sep 17 00:00:00 2001
From: David Halter
Date: Sun, 24 Mar 2013 22:51:17 +0430
Subject: [PATCH] on the way to a better fast_parser - improved a lot of the positioning stuff

---
 jedi/common.py      | 77 ++++++++++++++++++++++++++++++++++++---------
 jedi/fast_parser.py | 43 +++++++++++++++----------
 jedi/parsing.py     | 21 +++++++------
 3 files changed, 101 insertions(+), 40 deletions(-)

diff --git a/jedi/common.py b/jedi/common.py
index 35a10f62..cfa374d4 100644
--- a/jedi/common.py
+++ b/jedi/common.py
@@ -6,6 +6,8 @@ from _compatibility import next
 import debug
 import settings
 
+FLOWS = ['if', 'else', 'elif', 'while', 'with', 'try', 'except', 'finally']
+
 
 class MultiLevelStopIteration(Exception):
     """
@@ -56,14 +58,21 @@ class PushBackIterator(object):
 
 
 class NoErrorTokenizer(object):
-    def __init__(self, readline, offset=(0, 0), stop_on_scope=False):
+    def __init__(self, readline, offset=(0, 0), is_fast_parser=False):
         self.readline = readline
         self.gen = PushBackIterator(tokenize.generate_tokens(readline))
         self.offset = offset
-        self.stop_on_scope = stop_on_scope
-        self.first_scope = False
         self.closed = False
-        self.first = True
+        self.is_first = True
+
+        # fast parser options
+        self.is_fast_parser = is_fast_parser
+        self.current = self.previous = [None, None, (0, 0), (0, 0), '']
+        self.in_flow = False
+        self.new_indent = False
+        self.parser_indent = 0
+        self.is_decorator = False
+        self.first_stmt = True
 
     def push_last_back(self):
         self.gen.push_back(self.current)
@@ -76,6 +85,8 @@ class NoErrorTokenizer(object):
         if self.closed:
             raise MultiLevelStopIteration()
         try:
+            last_previous = self.previous
+            self.previous = self.current
             self.current = next(self.gen)
         except tokenize.TokenError:
             # We just ignore this error, I try to handle it earlier - as
@@ -99,22 +110,60 @@ class NoErrorTokenizer(object):
 
         c = list(self.current)
 
-        # stop if a new class or definition is started at position zero.
-        breaks = ['def', 'class', '@']
-        if self.stop_on_scope and c[1] in breaks and c[2][1] == 0:
-            if self.first_scope:
-                self.closed = True
-                raise MultiLevelStopIteration()
-            elif c[1] != '@':
-                self.first_scope = True
+        if c[0] == tokenize.ENDMARKER:
+            self.current = self.previous
+            self.previous = last_previous
+            raise MultiLevelStopIteration()
 
-        if self.first:
+        # this is exactly the same check as in fast_parser, but this time
+        # with tokenize and therefore precise.
+        breaks = ['def', 'class', '@']
+
+        if self.is_first:
             c[2] = self.offset[0] + c[2][0], self.offset[1] + c[2][1]
             c[3] = self.offset[0] + c[3][0], self.offset[1] + c[3][1]
-            self.first = False
+            self.is_first = False
         else:
             c[2] = self.offset[0] + c[2][0], c[2][1]
             c[3] = self.offset[0] + c[3][0], c[3][1]
+        print 'h', c, tokenize.tok_name[c[0]], self.current[2:4]
+        self.current = c
+
+        def close():
+            if not self.first_stmt:
+                self.closed = True
+                raise MultiLevelStopIteration()
+
+        # ignore indents/comments
+        if self.is_fast_parser \
+                and self.previous[0] in (tokenize.INDENT, tokenize.NL, None,
+                                         tokenize.NEWLINE, tokenize.DEDENT) \
+                and c[0] not in (tokenize.COMMENT, tokenize.INDENT,
+                                 tokenize.NL, tokenize.NEWLINE, tokenize.DEDENT):
+            print c, tokenize.tok_name[c[0]]
+
+            tok = c[1]
+            indent = c[2][1]
+            if indent < self.parser_indent:  # -> dedent
+                self.parser_indent = indent
+                self.new_indent = False
+                if not self.in_flow:
+                    close()
+                self.in_flow = False
+            elif self.new_indent:
+                self.parser_indent = indent
+
+            if not self.in_flow:
+                if tok in FLOWS or tok in breaks:
+                    self.in_flow = tok in FLOWS
+                    if not self.is_decorator and not self.in_flow:
+                        close()
+                    self.is_decorator = '@' == tok
+                    if not self.is_decorator:
+                        self.parser_indent += 1  # new scope: must be higher
+                        self.new_indent = True
+
+            if tok != '@':
+                self.first_stmt = False
 
         return c
diff --git a/jedi/fast_parser.py b/jedi/fast_parser.py
index e3b74c44..8811d290 100644
--- a/jedi/fast_parser.py
+++ b/jedi/fast_parser.py
@@ -11,6 +11,7 @@ import settings
 import parsing
 import parsing_representation as pr
 import cache
+import common
 
 
 class Module(pr.Simple, pr.Module):
@@ -219,17 +220,15 @@
                 parts.append(txt)
                 current_lines[:] = []
 
-        flows = ['if', 'else', 'elif', 'while', 'with', 'try', 'except',
-                 'finally']
-        r_keyword = '^[ \t]*(def|class|@|%s)' % '|'.join(flows)
+        r_keyword = '^[ \t]*(def|class|@|%s)' % '|'.join(common.FLOWS)
 
         lines = code.splitlines()
         current_lines = []
         parts = []
-        is_generator = False
+        is_decorator = False
         current_indent = 0
        new_indent = False
-        is_flow = False
+        in_flow = False
         # All things within flows are simply being ignored.
         for i, l in enumerate(lines):
             # check for dedents
@@ -242,29 +241,35 @@
             if indent < current_indent:  # -> dedent
                 current_indent = indent
                 new_indent = False
-                if not is_flow:
+                if not in_flow:
                     add_part()
-                is_flow = False
+                in_flow = False
             elif new_indent:
                 current_indent = indent
                 new_indent = False
 
             # Check lines for functions/classes and split the code there.
-            if not is_flow:
+            if not in_flow:
                 m = re.match(r_keyword, l)
                 if m:
-                    is_flow = m.group(1) in flows
+                    in_flow = m.group(1) in common.FLOWS
-                    if not is_generator and not is_flow:
+                    if not is_decorator and not in_flow:
                         add_part()
                         current_lines = []
-                    is_generator = '@' == m.group(1)
+                    is_decorator = '@' == m.group(1)
-                    if not is_generator:
+                    if not is_decorator:
                         current_indent += 1  # it must be higher
                         new_indent = True
 
             current_lines.append(l)
 
         add_part()
+        for p in parts:
+            #print '#####################################'
+            #print p
+            #print len(p.splitlines())
+            pass
+
         return parts
 
     def _parse(self, code):
@@ -280,11 +285,12 @@
                 el = module.imports[0]
             return el.start_pos[1]
 
-        if self.parsers:
+        if self.parsers and False:
             new_indent = get_indent(module)
             old_indent = get_indent(self.parsers[-1].module)
             if old_indent < new_indent:
-                module.parent = self.parsers[-1].module.subscopes[0]
+                #module.parent = self.parsers[-1].module.subscopes[0]
+                # TODO set parents + add to subscopes
                 return
 
         p.module.parent = self.module
@@ -301,7 +307,7 @@
         p = None
         parser_order = 0
         for code_part in parts:
-            lines = code_part.count('\n')
+            lines = code_part.count('\n') + 1
             # the parser is using additional newlines, therefore subtract
             if p is None or line_offset >= p.end_pos[0] - 2:
                 # check if code_part has already been parsed
@@ -336,8 +342,13 @@
 
             parser_order += 1
             line_offset += lines
-            start += len(code_part)
+            print line_offset
+            start += len(code_part) + 1  # +1 for newline
         self.parsers[parser_order + 1:] = []
+        for p in self.parsers:
+            print(p.module.get_code())
+            print(p.module.start_pos, p.module.end_pos)
+        exit()
 
     def reset_caches(self):
         self._user_scope = None
diff --git a/jedi/parsing.py b/jedi/parsing.py
index 06036193..0ce923be 100644
--- a/jedi/parsing.py
+++ b/jedi/parsing.py
@@ -52,13 +52,11 @@ class Parser(object):
         self.user_stmt = None
         self.no_docstr = no_docstr
 
+        self.start_pos = self.end_pos = 1 + offset[0], offset[1]
         # initialize global Scope
-        self.module = pr.SubModule(module_path, (offset[0] + 1, offset[1]),
-                                   top_module)
+        self.module = pr.SubModule(module_path, self.start_pos, top_module)
         self.scope = self.module
         self.current = (None, None)
-        self.start_pos = 1, 0
-        self.end_pos = 1, 0
 
         source = source + '\n'  # end with \n, because the parser needs it
         buf = StringIO(source)
@@ -79,6 +77,10 @@
             # because of `self.module.used_names`.
             d.parent = self.module
 
+        if self.current[0] in (tokenize.NL, tokenize.NEWLINE):
+            # we added a newline before, so we need to "remove" it again.
+            self.end_pos = self._gen.previous[2]
+        self.start_pos = self.module.start_pos
         self.module.end_pos = self.end_pos
 
         del self._gen
@@ -170,8 +172,6 @@
         while True:
             defunct = False
             token_type, tok = self.next()
-            if token_type == tokenize.ENDMARKER:
-                break
             if brackets and tok == '\n':
                 self.next()
             if tok == '(':  # python allows only one `(` in the statement.
@@ -421,8 +421,10 @@
     def __next__(self):
         """ Generate the next tokenize pattern. """
""" try: - typ, tok, self.start_pos, self.end_pos, \ - self.parserline = next(self._gen) + typ, tok, start_pos, end_pos, self.parserline = next(self._gen) + # dedents shouldn't change positions + if typ != tokenize.DEDENT: + self.start_pos, self.end_pos = start_pos, end_pos except (StopIteration, common.MultiLevelStopIteration): # on finish, set end_pos correctly s = self.scope @@ -662,7 +664,6 @@ class Parser(object): self.freshscope = False else: if token_type not in [tokenize.COMMENT, tokenize.INDENT, - tokenize.NEWLINE, tokenize.NL, - tokenize.ENDMARKER]: + tokenize.NEWLINE, tokenize.NL]: debug.warning('token not classified', tok, token_type, self.start_pos[0])