""" Basically a parser that is faster, because it tries to parse only parts and if anything changes, it only reparses the changed parts. But because it's not finished (and still not working as I want), I won't document it any further. """ import re from itertools import chain from jedi._compatibility import use_metaclass, unicode from jedi import settings from jedi import common from jedi.parser import Parser from jedi.parser import tree as pr from jedi.parser import tokenize from jedi import cache from jedi.parser.tokenize import source_tokens, FLOWS, NEWLINE, COMMENT, ENDMARKER class FastModule(pr.Module, pr.Simple): type = 'file_input' def __init__(self, parsers): super(FastModule, self).__init__([]) self.parsers = parsers self.reset_caches() def reset_caches(self): """ This module does a whole lot of caching, because it uses different parsers. """ with common.ignored(AttributeError): del self._used_names def __getattr__(self, name): if name.startswith('__'): raise AttributeError('Not available!') else: return getattr(self.parsers[0].module, name) @property @cache.underscore_memoization def used_names(self): used_names = {} for p in self.parsers: for k, statement_set in p.module.used_names.items(): if k in used_names: used_names[k] |= statement_set else: used_names[k] = set(statement_set) return used_names def __repr__(self): return "" % (type(self).__name__, self.name, self.start_pos[0], self.end_pos[0]) class MergedNamesDict(object): def __init__(self, dicts): self._dicts = dicts def __getitem__(self, value): return list(chain.from_iterable(dct[value] for dct in self._dcts)) def values(self): lst = [] for dct in self._dicts: lst.append(dct.values()) return lst class CachedFastParser(type): """ This is a metaclass for caching `FastParser`. """ def __call__(self, grammar, source, module_path=None): if not settings.fast_parser: return Parser(grammar, source, module_path) pi = cache.parser_cache.get(module_path, None) if pi is None or isinstance(pi.parser, Parser): p = super(CachedFastParser, self).__call__(grammar, source, module_path) else: p = pi.parser # pi is a `cache.ParserCacheItem` p.update(source) return p class ParserNode(object): def __init__(self, fast_module, parser, code, parent=None): self._fast_module = fast_module self.parent = parent self.parser_children = [] # must be created before new things are added to it. self.save_contents(parser, code) def save_contents(self, parser, code): print('SAVE') self.code = code self.hash = hash(code) self.parser = parser try: # With fast_parser we have either 1 subscope or only statements. self.content_scope = parser.module.subscopes[0] except IndexError: self.content_scope = self._fast_module """ scope = self.content_scope self._contents = {} for c in pr.SCOPE_CONTENTS: self._contents[c] = list(getattr(scope, c)) self._is_generator = scope.is_generator """ self.old_children = self.parser_children self.parser_children = [] def reset_contents(self): """ scope = self.content_scope for key, c in self._contents.items(): setattr(scope, key, list(c)) scope.is_generator = self._is_generator """ """ if self.parent is None: # Global vars of the first one can be deleted, in the global scope # they make no sense. 

class ParserNode(object):
    def __init__(self, fast_module, parser, code, parent=None):
        self._fast_module = fast_module
        self.parent = parent

        # must be created before new things are added to it.
        self.parser_children = []
        self.save_contents(parser, code)

    def save_contents(self, parser, code):
        print('SAVE')
        self.code = code
        self.hash = hash(code)
        self.parser = parser

        try:
            # With fast_parser we have either 1 subscope or only statements.
            self.content_scope = parser.module.subscopes[0]
        except IndexError:
            self.content_scope = self._fast_module

        """
        scope = self.content_scope
        self._contents = {}
        for c in pr.SCOPE_CONTENTS:
            self._contents[c] = list(getattr(scope, c))
        self._is_generator = scope.is_generator
        """

        self.old_children = self.parser_children
        self.parser_children = []

    def reset_contents(self):
        """
        scope = self.content_scope
        for key, c in self._contents.items():
            setattr(scope, key, list(c))
        scope.is_generator = self._is_generator
        """

        """
        if self.parent is None:
            # Global vars of the first one can be deleted, in the global
            # scope they make no sense.
            self.parser.module.global_vars = []
        """

        for c in self.parser_children:
            c.reset_contents()

    def parent_until_indent(self, indent=None):
        if indent is None or self.indent >= indent and self.parent:
            self.old_children = []
            if self.parent is not None:
                return self.parent.parent_until_indent(indent)
        return self

    @property
    def indent(self):
        if not self.parent:
            return 0
        module = self.parser.module
        try:
            el = module.subscopes[0]
        except IndexError:
            try:
                el = module.statements[0]
            except IndexError:
                try:
                    el = module.imports[0]
                except IndexError:
                    try:
                        el = [r for r in module.returns if r is not None][0]
                    except IndexError:
                        return self.parent.indent + 1
        return el.start_pos[1]

    def _set_items(self, parser, set_parent=False):
        # insert parser objects into the current structure
        scope = self.content_scope
        if set_parent:
            for child in parser.module.children:
                child.parent = scope
                scope.children.append(child)
                print('\t\t', scope, child)
                """
                if isinstance(i, (pr.Function, pr.Class)):
                    for d in i.decorators:
                        d.parent = scope
                """

        # TODO global_vars ? is_generator ?
        """
        cur = self
        while cur.parent is not None:
            cur = cur.parent
        cur.parser.module.global_vars += parser.module.global_vars

        scope.is_generator |= parser.module.is_generator
        """

    def add_node(self, node, set_parent=False):
        """Adding a node here means reusing a node that was created (and
        parsed) in an earlier run."""
        print('ADD')
        self.parser_children.append(node)
        self._set_items(node.parser, set_parent=set_parent)

        node.old_children = node.parser_children  # TODO potential memory leak?
        node.parser_children = []

        """
        scope = self.content_scope
        while scope is not None:
            #print('x', scope)
            if not isinstance(scope, pr.SubModule):
                # TODO This seems like a strange thing. Check again.
                scope.end_pos = node.content_scope.end_pos
            scope = scope.parent
        """
        return node

    def add_parser(self, parser, code):
        print('add parser')
        return self.add_node(ParserNode(self._fast_module, parser, code, self),
                             True)
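
# Illustrative sketch, never called; not part of this module's machinery.
# It shows what `ParserNode.parent_until_indent` does, on a hypothetical
# minimal `_Node` type: starting from the innermost node, walk towards the
# root until reaching a node whose indentation is smaller than the requested
# one. That node becomes the attachment point for newly parsed code.
def _example_parent_until_indent():
    class _Node(object):
        def __init__(self, indent, parent=None):
            self.indent = indent
            self.parent = parent

        def parent_until_indent(self, indent=None):
            if indent is None or self.indent >= indent and self.parent:
                if self.parent is not None:
                    return self.parent.parent_until_indent(indent)
            return self

    root = _Node(0)
    func = _Node(4, root)
    inner = _Node(8, func)
    # A new top-level block (indent 0) must attach to the root node.
    assert inner.parent_until_indent(0) is root
    # A block indented by 8 can stay attached to the function node.
    assert inner.parent_until_indent(8) is func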

class FastParser(use_metaclass(CachedFastParser)):
    _keyword_re = re.compile('^[ \t]*(def|class|@|%s)'
                             % '|'.join(tokenize.FLOWS))

    def __init__(self, grammar, code, module_path=None):
        # set values like `pr.Module`.
        self._grammar = grammar
        self.module_path = module_path
        print(module_path)

        self.current_node = None
        self.parsers = []
        self.module = FastModule(self.parsers)
        self.reset_caches()

        try:
            self._parse(code)
        except:
            # FastParser is cached, be careful with exceptions.
            del self.parsers[:]
            raise

    def update(self, code):
        self.reset_caches()

        try:
            self._parse(code)
        except:
            # FastParser is cached, be careful with exceptions.
            del self.parsers[:]
            raise

    def _split_parts(self, code):
        """
        Split the code into different parts. This makes it possible to parse
        each part separately and therefore cache parts of the file instead of
        everything.
        """
        def gen_part():
            text = '\n'.join(current_lines)
            del current_lines[:]
            return text

        # Split only on newlines. Distinguishing \r\n is the tokenizer's job.
        self._lines = code.split('\n')
        current_lines = []
        is_decorator = False
        current_indent = 0
        old_indent = 0
        new_indent = False
        in_flow = False
        # All things within flows are simply being ignored.
        for l in self._lines:
            # check for dedents
            s = l.lstrip('\t ')
            indent = len(l) - len(s)
            if not s or s[0] in ('#', '\r'):
                current_lines.append(l)  # just ignore comments and blank lines
                continue

            if indent < current_indent:  # -> dedent
                current_indent = indent
                new_indent = False
                if not in_flow or indent < old_indent:
                    if current_lines:
                        yield gen_part()
                in_flow = False
            elif new_indent:
                current_indent = indent
                new_indent = False

            # Check lines for functions/classes and split the code there.
            if not in_flow:
                m = self._keyword_re.match(l)
                if m:
                    in_flow = m.group(1) in tokenize.FLOWS
                    if not is_decorator and not in_flow:
                        if current_lines:
                            yield gen_part()
                    is_decorator = '@' == m.group(1)
                    if not is_decorator:
                        old_indent = current_indent
                        current_indent += 1  # it must be higher
                        new_indent = True
                elif is_decorator:
                    is_decorator = False

            current_lines.append(l)
        if current_lines:
            yield gen_part()

    def _parse(self, code):
        """ :type code: str """
        def empty_parser():
            new, temp = self._get_parser(unicode(''), unicode(''), 0, [],
                                         False)
            return new

        del self.parsers[:]
        line_offset = 0
        start = 0
        p = None
        is_first = True

        for code_part in self._split_parts(code):
            print(repr(code_part))
            if is_first or line_offset >= p.module.end_pos[0]:
                indent = len(code_part) - len(code_part.lstrip('\t '))

                if is_first and self.current_node is not None:
                    nodes = [self.current_node]
                else:
                    nodes = []
                if self.current_node is not None:
                    self.current_node = \
                        self.current_node.parent_until_indent(indent)
                    nodes += self.current_node.old_children

                # check if code_part has already been parsed
                # print '#'*45, line_offset, p and p.module.end_pos, '\n', code_part
                p, node = self._get_parser(code_part, code[start:],
                                           line_offset, nodes, not is_first)

                # The code_part that is actually used may differ from the
                # given code part: because of docstrings, for example,
                # there's a chance that the splits are wrong.
                used_lines = self._lines[line_offset:p.module.end_pos[0]]
                code_part_actually_used = '\n'.join(used_lines)

                if is_first and p.module.subscopes:
                    print('NOXXXX')
                    # special case: we cannot use a function subscope as a
                    # base scope, subscopes would save all the other contents
                    new = empty_parser()
                    if self.current_node is None:
                        self.current_node = ParserNode(self.module, new, '')
                    else:
                        self.current_node.save_contents(new, '')
                    self.parsers.append(new)
                    is_first = False

                if is_first:
                    if self.current_node is None:
                        self.current_node = \
                            ParserNode(self.module, p, code_part_actually_used)
                    else:
                        self.current_node.save_contents(
                            p, code_part_actually_used)
                else:
                    if node is None:
                        self.current_node = self.current_node.add_parser(
                            p, code_part_actually_used)
                    else:
                        self.current_node = self.current_node.add_node(node)

                self.parsers.append(p)
                is_first = False
            #else:
                #print '#'*45, line_offset, p.module.end_pos, 'theheck\n', repr(code_part)

            line_offset += code_part.count('\n') + 1
            start += len(code_part) + 1  # +1 for the newline

        if self.parsers:
            self.current_node = self.current_node.parent_until_indent()
        else:
            self.parsers.append(empty_parser())

        """ TODO used?
        self.module.end_pos = self.parsers[-1].module.end_pos
        """

        # print(self.parsers[0].module.get_code())

    def _get_parser(self, code, parser_code, line_offset, nodes, no_docstr):
        h = hash(code)
        for index, node in enumerate(nodes):
            if node.hash == h and node.code == code:
                if node != self.current_node:
                    offset = int(nodes[0] == self.current_node)
                    self.current_node.old_children.pop(index - offset)
                p = node.parser
                m = p.module
                m.line_offset += line_offset + 1 - m.start_pos[0]
                break
        else:
            tokenizer = FastTokenizer(parser_code, line_offset)
            p = Parser(self._grammar, parser_code, self.module_path,
                       tokenizer=tokenizer)
            # p.module.parent = self.module  # With the new parser this is
            #                                # not necessary anymore?
            node = None

        return p, node

    def reset_caches(self):
        self.module.reset_caches()
        if self.current_node is not None:
            self.current_node.reset_contents()
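
# Illustrative sketch, never called; not part of this module's machinery.
# The hypothetical `split` helper shows the core idea of `_split_parts` on a
# trivial input: cut the source at top-level `def`/`class` keywords so each
# chunk can be parsed and cached on its own. The real method additionally
# tracks indentation, flow keywords and decorators.
def _example_split_parts():
    keyword_re = re.compile(r'^(def|class)\s')

    def split(code):
        parts = []
        current = []
        for line in code.split('\n'):
            if keyword_re.match(line) and current:
                parts.append('\n'.join(current))
                current = []
            current.append(line)
        if current:
            parts.append('\n'.join(current))
        return parts

    source = 'import os\n\ndef f():\n    pass\n\nclass C:\n    pass\n'
    parts = split(source)
    assert parts[0] == 'import os\n'
    assert parts[1].startswith('def f')
    assert parts[2].startswith('class C')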

class FastTokenizer(object):
    """
    Breaks when certain conditions are met, e.g. when a new function or
    class opens.
    """
    def __init__(self, source, line_offset=0):
        self.source = source
        self.gen = source_tokens(source, line_offset)
        self.closed = False

        # fast parser options
        self.current = self.previous = None, '', (0, 0)
        self.in_flow = False
        self.new_indent = False
        self.parser_indent = self.old_parser_indent = 0
        self.is_decorator = False
        self.first_stmt = True
        self._add_end_marker = False
        self.parentheses_level = 0

    def __iter__(self):
        return self

    def next(self):
        """ Python 2 compatibility """
        return self.__next__()

    def __next__(self):
        if self.closed:
            if self._add_end_marker:
                self._add_end_marker = False
                start_pos = self.current[2]
                return tokenize.ENDMARKER, '', start_pos, ''
            raise StopIteration

        typ, value, start_pos, prefix = current = next(self.gen)
        if typ == ENDMARKER:
            self.closed = True

        self.previous = self.current
        self.current = current

        # This is exactly the same check as in fast_parser, but this time
        # with tokenize and therefore precise.
        breaks = ['def', 'class', '@']

        def close():
            if not self.first_stmt:
                self._add_end_marker = True
                self.closed = True

        # Ignore comments/newlines, they are irrelevant for indentation.
        if self.previous[0] in (None, NEWLINE) \
                and typ not in (COMMENT, NEWLINE):
            # print c, tok_name[c[0]]
            indent = start_pos[1]
            if self.parentheses_level:
                # parentheses ignore the indentation rules.
                pass
            elif indent < self.parser_indent:  # -> dedent
                self.parser_indent = indent
                self.new_indent = False
                if not self.in_flow or indent < self.old_parser_indent:
                    close()
                self.in_flow = False
            elif self.new_indent:
                self.parser_indent = indent
                self.new_indent = False

            if not self.in_flow:
                if value in FLOWS or value in breaks:
                    self.in_flow = value in FLOWS
                    if not self.is_decorator and not self.in_flow:
                        close()
                    self.is_decorator = '@' == value
                    if not self.is_decorator:
                        self.old_parser_indent = self.parser_indent
                        self.parser_indent += 1  # new scope: must be higher
                        self.new_indent = True

            if value != '@':
                if self.first_stmt and not self.new_indent:
                    self.parser_indent = indent
                self.first_stmt = False

        # Track the parentheses level, because indentation inside parentheses
        # is irrelevant. The `and value` guard is needed because the empty
        # string (e.g. from ENDMARKER) is a substring of everything.
        if value in '([{' and value:
            self.parentheses_level += 1
        elif value in ')]}' and value:
            self.parentheses_level = max(self.parentheses_level - 1, 0)
        return current
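
# Illustrative sketch, never called; not part of this module's machinery.
# It shows the idea behind `FastTokenizer` using the standard library
# tokenizer instead of jedi's: the hypothetical `limited_tokens` generator
# yields tokens and stops as soon as a new top-level `def`/`class` starts,
# which is how a single parser can be limited to one part of the file.
def _example_fast_tokenizer_idea():
    import io
    import tokenize as std_tokenize

    def limited_tokens(source):
        readline = io.StringIO(source).readline
        seen_stmt = False
        for typ, string, start, end, line in \
                std_tokenize.generate_tokens(readline):
            if typ == std_tokenize.NAME and string in ('def', 'class') \
                    and start[1] == 0 and seen_stmt:
                return  # break before the next top-level definition
            if typ not in (std_tokenize.NEWLINE, std_tokenize.NL,
                           std_tokenize.COMMENT):
                seen_stmt = True
            yield typ, string, start

    source = u'x = 1\ndef f():\n    pass\n'
    values = [string for typ, string, start in limited_tokens(source)]
    assert 'def' not in values  # tokenization stopped at the definition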