""" The ``Parser`` tries to convert the available Python code in an easy to read format, something like an abstract syntax tree. The classes who represent this tree, are sitting in the :mod:`jedi.parser.tree` module. The Python module ``tokenize`` is a very important part in the ``Parser``, because it splits the code into different words (tokens). Sometimes it looks a bit messy. Sorry for that! You might ask now: "Why didn't you use the ``ast`` module for this? Well, ``ast`` does a very good job understanding proper Python code, but fails to work as soon as there's a single line of broken code. There's one important optimization that needs to be known: Statements are not being parsed completely. ``Statement`` is just a representation of the tokens within the statement. This lowers memory usage and cpu time and reduces the complexity of the ``Parser`` (there's another parser sitting inside ``Statement``, which produces ``Array`` and ``Call``). """ import logging import os from jedi._compatibility import next from jedi import common from jedi.parser import tree as pt from jedi.parser import tokenize from jedi.parser import pgen2 OPERATOR_KEYWORDS = 'and', 'for', 'if', 'else', 'in', 'is', 'lambda', 'not', 'or' # Not used yet. In the future I intend to add something like KeywordStatement STATEMENT_KEYWORDS = 'assert', 'del', 'global', 'nonlocal', 'raise', \ 'return', 'yield', 'pass', 'continue', 'break' _loaded_grammars = {} def load_grammar(file='grammar3.4'): global _loaded_grammars path = os.path.join(os.path.dirname(__file__), file) + '.txt' try: return _loaded_grammars[path] except KeyError: return _loaded_grammars.setdefault(path, pgen2.load_grammar(path)) class ErrorStatement(object): def __init__(self, stack, next_token, next_start_pos): self.stack = stack self.next_token = next_token self.next_start_pos = next_start_pos @property def first_pos(self): first_type, nodes = self.stack[0] return nodes[0].start_pos @property def first_type(self): first_type, nodes = self.stack[0] return first_type class Parser(object): """ This class is used to parse a Python file, it then divides them into a class structure of different scopes. :param grammar: The grammar object of pgen2. Loaded by load_grammar. :param source: The codebase for the parser. Must be unicode. :param module_path: The path of the module in the file system, may be None. :type module_path: str :param top_module: Use this module as a parent instead of `self.module`. """ def __init__(self, grammar, source, module_path=None, tokenizer=None): """ This is the way I imagine a parser describing the init function """ if not source.endswith('\n'): source += '\n' self._ast_mapping = { 'expr_stmt': pt.ExprStmt, 'classdef': pt.Class, 'funcdef': pt.Function, 'file_input': pt.SubModule, 'import_name': pt.ImportName, 'import_from': pt.ImportFrom, 'break_stmt': pt.KeywordStatement, 'continue_stmt': pt.KeywordStatement, 'return_stmt': pt.ReturnStmt, 'raise_stmt': pt.KeywordStatement, 'yield_expr': pt.YieldExpr, 'del_stmt': pt.KeywordStatement, 'pass_stmt': pt.KeywordStatement, 'global_stmt': pt.GlobalStmt, 'nonlocal_stmt': pt.KeywordStatement, 'assert_stmt': pt.KeywordStatement, 'if_stmt': pt.IfStmt, 'with_stmt': pt.WithStmt, 'for_stmt': pt.ForStmt, 'while_stmt': pt.WhileStmt, 'try_stmt': pt.TryStmt, 'comp_for': pt.CompFor, 'decorator': pt.Decorator, } self.global_names = [] # TODO do print absolute import detection here. 
        #try:
        #    del python_grammar_no_print_statement.keywords["print"]
        #except KeyError:
        #    pass  # Doesn't exist in the Python 3 grammar.

        #if self.options["print_function"]:
        #    python_grammar = pygram.python_grammar_no_print_statement
        #else:

        # When this is True, the refactor*() methods will call write_file() for
        # files processed even if they were not changed during refactoring. If
        # and only if the refactor method's write parameter was True.
        self.used_names = {}
        self.scope_names_stack = [{}]
        self.error_statement_stacks = []

        logger = logging.getLogger("Jedi-Parser")

        if False:
            d = pgen2.Driver(grammar, self.convert_node, self.convert_leaf,
                             self.error_recovery, logger=logger)
            self.module = d.parse_string(source).get_parent_until()
        else:
            p = pgen2.parse.Parser(grammar, self.convert_node,
                                   self.convert_leaf, self.error_recovery)
            tokenizer = tokenizer or tokenize.source_tokens(source)
            self.module = p.parse(p.tokenize(self._tokenize(tokenizer)))

        self.module.used_names = self.used_names
        self.module.path = module_path
        self.module.set_global_names(self.global_names)
        self.module.error_statement_stacks = self.error_statement_stacks
        self.grammar_symbols = grammar.number2symbol

    def convert_node(self, grammar, type, children):
        """
        Convert raw node information to a Node instance.

        This is passed to the parser driver which calls it whenever a
        reduction of a grammar rule produces a new complete node, so that the
        tree is built strictly bottom-up.
        """
        symbol = grammar.number2symbol[type]
        try:
            new_node = self._ast_mapping[symbol](children)
        except KeyError:
            new_node = pt.Node(symbol, children)

        # We need to check raw_node always, because the same node can be
        # returned by convert multiple times.
        if symbol == 'global_stmt':
            self.global_names += new_node.get_defined_names()
        elif isinstance(new_node, (pt.ClassOrFunc, pt.Module)) \
                and symbol in ('funcdef', 'classdef', 'file_input'):
            # scope_name_stack handling
            scope_names = self.scope_names_stack.pop()
            if isinstance(new_node, pt.ClassOrFunc):
                n = new_node.name
                scope_names[n.value].remove(n)
                # Set the func name of the current node
                arr = self.scope_names_stack[-1].setdefault(n.value, [])
                arr.append(n)
            new_node.names_dict = scope_names
        elif isinstance(new_node, pt.CompFor):
            # The name definitions of comprehensions shouldn't be part of the
            # current scope. They are part of the comprehension scope.
            for n in new_node.get_defined_names():
                self.scope_names_stack[-1][n.value].remove(n)
        return new_node

    def convert_leaf(self, grammar, type, value, prefix, start_pos):
        #print('leaf', value, pytree.type_repr(type))
        if type == tokenize.NAME:
            if value in grammar.keywords:
                if value in ('def', 'class'):
                    self.scope_names_stack.append({})

                return pt.Keyword(value, start_pos, prefix)
            else:
                name = pt.Name(value, start_pos, prefix)
                # Keep a listing of all used names
                arr = self.used_names.setdefault(name.value, [])
                arr.append(name)
                arr = self.scope_names_stack[-1].setdefault(name.value, [])
                arr.append(name)
                return name
        elif type == tokenize.STRING:
            return pt.String(value, start_pos, prefix)
        elif type == tokenize.NUMBER:
            return pt.Number(value, start_pos, prefix)
        elif type in (tokenize.NEWLINE, tokenize.ENDMARKER):
            return pt.Whitespace(value, start_pos, prefix)
        else:
            return pt.Operator(value, start_pos, prefix)

    def error_recovery(self, grammar, stack, typ, value, start_pos, prefix,
                       add_token_callback):
        """
        This parser is written in a dynamic way, meaning it allows using
        different grammars (even non-Python). However, error recovery is
        purely written for Python.
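
        Roughly, the strategy below is: find the innermost ``suite`` or
        ``file_input`` on the parser stack, drop everything above it via
        ``_stack_removal`` (remembering the removed nodes as an
        ``ErrorStatement``), and then re-feed tokens such as ``import`` or
        ``def`` that can always start a new statement. ``DEDENT`` tokens
        either close the current suite or remove it if it is still empty.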
""" def current_suite(stack): # For now just discard everything that is not a suite or # file_input, if we detect an error. for index, (dfa, state, (typ, nodes)) in reversed(list(enumerate(stack))): # `suite` can sometimes be only simple_stmt, not stmt. symbol = grammar.number2symbol[typ] if symbol in ('file_input', 'suite'): break return index, symbol, nodes index, symbol, nodes = current_suite(stack) #print('err', tokenize.tok_name[typ], repr(value), start_pos, len(stack), index) self._stack_removal(grammar, stack, index + 1, value, start_pos) if value in ('import', 'from', 'class', 'def', 'try', 'while', 'return'): # Those can always be new statements. add_token_callback(typ, value, prefix, start_pos) elif typ == tokenize.DEDENT: if symbol == 'suite': # If a function or anything else contains a suite that is # "empty" (just NEWLINE/INDENT), we remove it. If it's not # empty, we can close it. if len(nodes) > 2: # Finish the suite. add_token_callback(typ, value, prefix, start_pos) else: # Remove the current suite from the stack to look deeper in # the stack for a suite/file_input. index, symbol, nodes = current_suite(stack[:-1]) self._stack_removal(grammar, stack, index + 1, value, start_pos) def _stack_removal(self, grammar, stack, start_index, value, start_pos): def clear_names(children): for c in children: try: clear_names(c.children) except AttributeError: if isinstance(c, pt.Name): try: self.scope_names_stack[-1][c.value].remove(c) self.used_names[c.value].remove(c) except ValueError: pass # This may happen with CompFor. for dfa, state, node in stack[start_index:]: clear_names(children=node[1]) failed_stack = [] found = False for dfa, state, (typ, nodes) in stack[start_index:]: if nodes: found = True if found: symbol = grammar.number2symbol[typ] failed_stack.append((symbol, nodes)) if nodes and nodes[0] in ('def', 'class'): self.scope_names_stack.pop() if failed_stack: err = ErrorStatement(failed_stack, value, start_pos) self.error_statement_stacks.append(err) stack[start_index:] = [] def _tokenize(self, tokenizer): """ while first_pos[1] <= self._scope.start_pos[1] \ and (token_type == tokenize.NAME or tok_str in ('(', '['))\ and self._scope != self.module: self._scope.end_pos = first_pos self._scope = self._scope.parent if isinstance(self._scope, pr.Module) \ and not isinstance(self._scope, pr.SubModule): self._scope = self.module """ new_scope = False for token in tokenizer: typ = token.type value = token.value if typ == tokenize.OP: typ = pgen2.grammar.opmap[value] yield typ, value, token.prefix, token.start_pos def __init__old__(self, source, module_path=None, no_docstr=False, tokenizer=None, top_module=None): """ TODO REMOVE THIS """ self.no_docstr = no_docstr tokenizer = tokenizer or tokenize.source_tokens(source) self._gen = PushBackTokenizer(tokenizer) # initialize global Scope start_pos = next(self._gen).start_pos self._gen.push_last_back() self.module = pt.SubModule(module_path, start_pos, top_module) self._scope = self.module self._top_module = top_module or self.module try: self._parse() except (common.MultiLevelStopIteration, StopIteration): # StopIteration needs to be added as well, because python 2 has a # strange way of handling StopIterations. # sometimes StopIteration isn't catched. Just ignore it. 
            # on finish, set end_pos correctly
            pass
        s = self._scope
        while s is not None:
            s.end_pos = self._gen.current.end_pos
            s = s.parent

        # clean up unused decorators
        for d in self._decorators:
            # set a parent for unused decorators, avoid NullPointerException
            # because of `self.module.used_names`.
            d.parent = self.module

        self.module.end_pos = self._gen.current.end_pos
        if self._gen.current.type == tokenize.NEWLINE:
            # This case is only relevant with the FastTokenizer, because
            # otherwise there's always an ENDMARKER.
            # We added a newline before, so we need to "remove" it again.
            #
            # NOTE: It should keep end_pos as-is if the last token of
            # a source is a NEWLINE, otherwise the newline at the end of
            # a source is not included in a ParserNode.code.
            if self._gen.previous.type != tokenize.NEWLINE:
                self.module.end_pos = self._gen.previous.end_pos

        del self._gen

    def __repr__(self):
        return "<%s: %s>" % (type(self).__name__, self.module)

    def _check_user_stmt(self, simple):
        # this is not user checking, just update the used_names
        for tok_name in self.module.temp_used_names:
            try:
                self.module.used_names[tok_name].add(simple)
            except KeyError:
                self.module.used_names[tok_name] = set([simple])
        self.module.temp_used_names = []

        if isinstance(simple, pt.Statement):
            for name, calls in simple.get_names_dict().items():
                self._scope.add_name_calls(name, calls)


class PushBackTokenizer(object):
    def __init__(self, tokenizer):
        self._tokenizer = tokenizer
        self._push_backs = []
        self.current = self.previous = tokenize.Token(None, '', (0, 0))

    def push_last_back(self):
        self._push_backs.append(self.current)

    def next(self):
        """ Python 2 Compatibility """
        return self.__next__()

    def __next__(self):
        if self._push_backs:
            return self._push_backs.pop(0)

        previous = self.current
        self.current = next(self._tokenizer)
        self.previous = previous
        return self.current

    def __iter__(self):
        return self
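

if __name__ == '__main__':
    # Minimal usage sketch, not part of the parser itself: it only uses names
    # defined above (load_grammar, Parser). The sample source string is made
    # up for illustration and, per the Parser docstring, must be unicode.
    sample = u"def foo(a, b):\n    return a + b\n\nx = foo(1, 2)\n"
    parser = Parser(load_grammar(), sample)
    print(parser.module)                         # the parsed pt.SubModule
    print(sorted(parser.module.used_names))      # every name that occurs
    print(parser.module.error_statement_stacks)  # ErrorStatement objects, if any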