diff --git a/jedi/parser/__init__.py b/jedi/parser/__init__.py
index f34b53fa..006d930c 100644
--- a/jedi/parser/__init__.py
+++ b/jedi/parser/__init__.py
@@ -1,4 +1,5 @@
-from jedi.parser.parser import Parser, ParserWithRecovery, ParserSyntaxError
+from jedi.parser.parser import ParserSyntaxError
+from jedi.parser.python.parser import Parser, ParserWithRecovery
 from jedi.parser.pgen2.pgen import generate_grammar
 from jedi.parser import python
diff --git a/jedi/parser/parser.py b/jedi/parser/parser.py
index aa086bee..28f0e8aa 100644
--- a/jedi/parser/parser.py
+++ b/jedi/parser/parser.py
@@ -15,14 +15,6 @@ within the statement. This lowers memory usage and cpu time and reduces
 the complexity of the ``Parser`` (there's another parser sitting inside
 ``Statement``, which produces ``Array`` and ``Call``).
 """
-import re
-
-from jedi.parser import tree as pt
-from jedi.parser import tokenize
-from jedi.parser.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER,
-                               STRING, tok_name)
-from jedi.parser.pgen2.parse import PgenParser
-
 
 class ParserSyntaxError(Exception):
     """
@@ -35,285 +27,3 @@ class ParserSyntaxError(Exception):
         self.position = position
 
 
-class Parser(object):
-    AST_MAPPING = {
-        'expr_stmt': pt.ExprStmt,
-        'classdef': pt.Class,
-        'funcdef': pt.Function,
-        'file_input': pt.Module,
-        'import_name': pt.ImportName,
-        'import_from': pt.ImportFrom,
-        'break_stmt': pt.KeywordStatement,
-        'continue_stmt': pt.KeywordStatement,
-        'return_stmt': pt.ReturnStmt,
-        'raise_stmt': pt.KeywordStatement,
-        'yield_expr': pt.YieldExpr,
-        'del_stmt': pt.KeywordStatement,
-        'pass_stmt': pt.KeywordStatement,
-        'global_stmt': pt.GlobalStmt,
-        'nonlocal_stmt': pt.KeywordStatement,
-        'print_stmt': pt.KeywordStatement,
-        'assert_stmt': pt.AssertStmt,
-        'if_stmt': pt.IfStmt,
-        'with_stmt': pt.WithStmt,
-        'for_stmt': pt.ForStmt,
-        'while_stmt': pt.WhileStmt,
-        'try_stmt': pt.TryStmt,
-        'comp_for': pt.CompFor,
-        'decorator': pt.Decorator,
-        'lambdef': pt.Lambda,
-        'old_lambdef': pt.Lambda,
-        'lambdef_nocond': pt.Lambda,
-    }
-
-    def __init__(self, grammar, source, start_symbol='file_input',
-                 tokens=None, start_parsing=True):
-        # Todo Remove start_parsing (with False)
-
-        self._used_names = {}
-
-        self.source = source
-        self._added_newline = False
-        # The Python grammar needs a newline at the end of each statement.
-        if not source.endswith('\n') and start_symbol == 'file_input':
-            source += '\n'
-            self._added_newline = True
-
-        self._start_symbol = start_symbol
-        self._grammar = grammar
-
-        self._parsed = None
-
-        if start_parsing:
-            if tokens is None:
-                tokens = tokenize.source_tokens(source, use_exact_op_types=True)
-            self.parse(tokens)
-
-    def parse(self, tokens):
-        if self._parsed is not None:
-            return self._parsed
-
-        start_number = self._grammar.symbol2number[self._start_symbol]
-        self.pgen_parser = PgenParser(
-            self._grammar, self.convert_node, self.convert_leaf,
-            self.error_recovery, start_number
-        )
-
-        self._parsed = self.pgen_parser.parse(tokens)
-
-        if self._start_symbol == 'file_input' != self._parsed.type:
-            # If there's only one statement, we get back a non-module. That's
-            # not what we want, we want a module, so we add it here:
-            self._parsed = self.convert_node(self._grammar,
-                                             self._grammar.symbol2number['file_input'],
-                                             [self._parsed])
-
-        if self._added_newline:
-            self.remove_last_newline()
-        # The stack is empty now, we don't need it anymore.
-        del self.pgen_parser
-        return self._parsed
-
-    def get_parsed_node(self):
-        # TODO remove in favor of get_root_node
-        return self._parsed
-
-    def get_root_node(self):
-        return self._parsed
-
-    def error_recovery(self, grammar, stack, arcs, typ, value, start_pos, prefix,
-                       add_token_callback):
-        raise ParserSyntaxError('SyntaxError: invalid syntax', start_pos)
-
-    def convert_node(self, grammar, type, children):
-        """
-        Convert raw node information to a Node instance.
-
-        This is passed to the parser driver which calls it whenever a reduction of a
-        grammar rule produces a new complete node, so that the tree is build
-        strictly bottom-up.
-        """
-        symbol = grammar.number2symbol[type]
-        try:
-            return Parser.AST_MAPPING[symbol](children)
-        except KeyError:
-            if symbol == 'suite':
-                # We don't want the INDENT/DEDENT in our parser tree. Those
-                # leaves are just cancer. They are virtual leaves and not real
-                # ones and therefore have pseudo start/end positions and no
-                # prefixes. Just ignore them.
-                children = [children[0]] + children[2:-1]
-            return pt.Node(symbol, children)
-
-    def convert_leaf(self, grammar, type, value, prefix, start_pos):
-        # print('leaf', repr(value), token.tok_name[type])
-        if type == tokenize.NAME:
-            if value in grammar.keywords:
-                return pt.Keyword(value, start_pos, prefix)
-            else:
-                name = pt.Name(value, start_pos, prefix)
-                # Keep a listing of all used names
-                arr = self._used_names.setdefault(name.value, [])
-                arr.append(name)
-                return name
-        elif type == STRING:
-            return pt.String(value, start_pos, prefix)
-        elif type == NUMBER:
-            return pt.Number(value, start_pos, prefix)
-        elif type == NEWLINE:
-            return pt.Newline(value, start_pos, prefix)
-        elif type == ENDMARKER:
-            return pt.EndMarker(value, start_pos, prefix)
-        else:
-            return pt.Operator(value, start_pos, prefix)
-
-    def remove_last_newline(self):
-        endmarker = self._parsed.children[-1]
-        # The newline is either in the endmarker as a prefix or the previous
-        # leaf as a newline token.
-        prefix = endmarker.prefix
-        if prefix.endswith('\n'):
-            endmarker.prefix = prefix = prefix[:-1]
-            last_end = 0
-            if '\n' not in prefix:
-                # Basically if the last line doesn't end with a newline. we
-                # have to add the previous line's end_position.
-                previous_leaf = endmarker.get_previous_leaf()
-                if previous_leaf is not None:
-                    last_end = previous_leaf.end_pos[1]
-            last_line = re.sub('.*\n', '', prefix)
-            endmarker.start_pos = endmarker.line - 1, last_end + len(last_line)
-        else:
-            newline = endmarker.get_previous_leaf()
-            if newline is None:
-                return  # This means that the parser is empty.
-
-            assert newline.value.endswith('\n')
-            newline.value = newline.value[:-1]
-            endmarker.start_pos = \
-                newline.start_pos[0], newline.start_pos[1] + len(newline.value)
-
-
-class ParserWithRecovery(Parser):
-    """
-    This class is used to parse a Python file, it then divides them into a
-    class structure of different scopes.
-
-    :param grammar: The grammar object of pgen2. Loaded by load_grammar.
-    :param source: The codebase for the parser. Must be unicode.
-    :param module_path: The path of the module in the file system, may be None.
-    :type module_path: str
-    """
-    def __init__(self, grammar, source, module_path=None, tokens=None,
-                 start_parsing=True):
-        self.syntax_errors = []
-
-        self._omit_dedent_list = []
-        self._indent_counter = 0
-        self._module_path = module_path
-
-        # TODO do print absolute import detection here.
-        # try:
-        #     del python_grammar_no_print_statement.keywords["print"]
-        # except KeyError:
-        #     pass  # Doesn't exist in the Python 3 grammar.
-
-        # if self.options["print_function"]:
-        #     python_grammar = pygram.python_grammar_no_print_statement
-        # else:
-        super(ParserWithRecovery, self).__init__(
-            grammar, source,
-            tokens=tokens,
-            start_parsing=start_parsing
-        )
-
-    def parse(self, tokenizer):
-        root_node = super(ParserWithRecovery, self).parse(self._tokenize(tokenizer))
-        self.module = root_node
-        self.module.used_names = self._used_names
-        self.module.path = self._module_path
-        return root_node
-
-    def error_recovery(self, grammar, stack, arcs, typ, value, start_pos, prefix,
-                       add_token_callback):
-        """
-        This parser is written in a dynamic way, meaning that this parser
-        allows using different grammars (even non-Python). However, error
-        recovery is purely written for Python.
-        """
-        def current_suite(stack):
-            # For now just discard everything that is not a suite or
-            # file_input, if we detect an error.
-            for index, (dfa, state, (type_, nodes)) in reversed(list(enumerate(stack))):
-                # `suite` can sometimes be only simple_stmt, not stmt.
-                symbol = grammar.number2symbol[type_]
-                if symbol == 'file_input':
-                    break
-                elif symbol == 'suite' and len(nodes) > 1:
-                    # suites without an indent in them get discarded.
-                    break
-                elif symbol == 'simple_stmt' and len(nodes) > 1:
-                    # simple_stmt can just be turned into a Node, if there are
-                    # enough statements. Ignore the rest after that.
-                    break
-            return index, symbol, nodes
-
-        index, symbol, nodes = current_suite(stack)
-        if symbol == 'simple_stmt':
-            index -= 2
-            (_, _, (type_, suite_nodes)) = stack[index]
-            symbol = grammar.number2symbol[type_]
-            suite_nodes.append(pt.Node(symbol, list(nodes)))
-            # Remove
-            nodes[:] = []
-            nodes = suite_nodes
-            stack[index]
-
-        # print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index)
-        if self._stack_removal(grammar, stack, arcs, index + 1, value, start_pos):
-            add_token_callback(typ, value, start_pos, prefix)
-        else:
-            if typ == INDENT:
-                # For every deleted INDENT we have to delete a DEDENT as well.
-                # Otherwise the parser will get into trouble and DEDENT too early.
-                self._omit_dedent_list.append(self._indent_counter)
-            else:
-                error_leaf = pt.ErrorLeaf(tok_name[typ].lower(), value, start_pos, prefix)
-                stack[-1][2][1].append(error_leaf)
-
-    def _stack_removal(self, grammar, stack, arcs, start_index, value, start_pos):
-        failed_stack = []
-        found = False
-        all_nodes = []
-        for dfa, state, (typ, nodes) in stack[start_index:]:
-            if nodes:
-                found = True
-            if found:
-                symbol = grammar.number2symbol[typ]
-                failed_stack.append((symbol, nodes))
-                all_nodes += nodes
-        if failed_stack:
-            stack[start_index - 1][2][1].append(pt.ErrorNode(all_nodes))
-
-        stack[start_index:] = []
-        return failed_stack
-
-    def _tokenize(self, tokenizer):
-        for typ, value, start_pos, prefix in tokenizer:
-            # print(tokenize.tok_name[typ], repr(value), start_pos, repr(prefix))
-            if typ == DEDENT:
-                # We need to count indents, because if we just omit any DEDENT,
-                # we might omit them in the wrong place.
-                o = self._omit_dedent_list
-                if o and o[-1] == self._indent_counter:
-                    o.pop()
-                    continue
-
-                self._indent_counter -= 1
-            elif typ == INDENT:
-                self._indent_counter += 1
-
-            yield typ, value, start_pos, prefix
-
-    def __repr__(self):
-        return "<%s: %s>" % (type(self).__name__, self.module)
diff --git a/jedi/parser/python/__init__.py b/jedi/parser/python/__init__.py
index ba03293f..12200dc7 100644
--- a/jedi/parser/python/__init__.py
+++ b/jedi/parser/python/__init__.py
@@ -5,7 +5,7 @@ import os
 
 from jedi._compatibility import FileNotFoundError
 from jedi.parser.pgen2.pgen import generate_grammar
-from jedi.parser.parser import Parser, ParserWithRecovery
+from jedi.parser.python.parser import Parser, ParserWithRecovery
 from jedi.parser.tokenize import source_tokens
diff --git a/jedi/parser/python/parser.py b/jedi/parser/python/parser.py
new file mode 100644
index 00000000..76a91758
--- /dev/null
+++ b/jedi/parser/python/parser.py
@@ -0,0 +1,292 @@
+import re
+
+from jedi.parser import tree as pt
+from jedi.parser import tokenize
+from jedi.parser.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER,
+                               STRING, tok_name)
+from jedi.parser.pgen2.parse import PgenParser
+from jedi.parser.parser import ParserSyntaxError
+
+
+class Parser(object):
+    AST_MAPPING = {
+        'expr_stmt': pt.ExprStmt,
+        'classdef': pt.Class,
+        'funcdef': pt.Function,
+        'file_input': pt.Module,
+        'import_name': pt.ImportName,
+        'import_from': pt.ImportFrom,
+        'break_stmt': pt.KeywordStatement,
+        'continue_stmt': pt.KeywordStatement,
+        'return_stmt': pt.ReturnStmt,
+        'raise_stmt': pt.KeywordStatement,
+        'yield_expr': pt.YieldExpr,
+        'del_stmt': pt.KeywordStatement,
+        'pass_stmt': pt.KeywordStatement,
+        'global_stmt': pt.GlobalStmt,
+        'nonlocal_stmt': pt.KeywordStatement,
+        'print_stmt': pt.KeywordStatement,
+        'assert_stmt': pt.AssertStmt,
+        'if_stmt': pt.IfStmt,
+        'with_stmt': pt.WithStmt,
+        'for_stmt': pt.ForStmt,
+        'while_stmt': pt.WhileStmt,
+        'try_stmt': pt.TryStmt,
+        'comp_for': pt.CompFor,
+        'decorator': pt.Decorator,
+        'lambdef': pt.Lambda,
+        'old_lambdef': pt.Lambda,
+        'lambdef_nocond': pt.Lambda,
+    }
+
+    def __init__(self, grammar, source, start_symbol='file_input',
+                 tokens=None, start_parsing=True):
+        # Todo Remove start_parsing (with False)
+
+        self._used_names = {}
+
+        self.source = source
+        self._added_newline = False
+        # The Python grammar needs a newline at the end of each statement.
+        if not source.endswith('\n') and start_symbol == 'file_input':
+            source += '\n'
+            self._added_newline = True
+
+        self._start_symbol = start_symbol
+        self._grammar = grammar
+
+        self._parsed = None
+
+        if start_parsing:
+            if tokens is None:
+                tokens = tokenize.source_tokens(source, use_exact_op_types=True)
+            self.parse(tokens)
+
+    def parse(self, tokens):
+        if self._parsed is not None:
+            return self._parsed
+
+        start_number = self._grammar.symbol2number[self._start_symbol]
+        self.pgen_parser = PgenParser(
+            self._grammar, self.convert_node, self.convert_leaf,
+            self.error_recovery, start_number
+        )
+
+        self._parsed = self.pgen_parser.parse(tokens)
+
+        if self._start_symbol == 'file_input' != self._parsed.type:
+            # If there's only one statement, we get back a non-module. That's
+            # not what we want, we want a module, so we add it here:
+            self._parsed = self.convert_node(self._grammar,
+                                             self._grammar.symbol2number['file_input'],
+                                             [self._parsed])
+
+        if self._added_newline:
+            self.remove_last_newline()
+        # The stack is empty now, we don't need it anymore.
+        del self.pgen_parser
+        return self._parsed
+
+    def get_parsed_node(self):
+        # TODO remove in favor of get_root_node
+        return self._parsed
+
+    def get_root_node(self):
+        return self._parsed
+
+    def error_recovery(self, grammar, stack, arcs, typ, value, start_pos, prefix,
+                       add_token_callback):
+        raise ParserSyntaxError('SyntaxError: invalid syntax', start_pos)
+
+    def convert_node(self, grammar, type, children):
+        """
+        Convert raw node information to a Node instance.
+
+        This is passed to the parser driver which calls it whenever a reduction of a
+        grammar rule produces a new complete node, so that the tree is built
+        strictly bottom-up.
+        """
+        symbol = grammar.number2symbol[type]
+        try:
+            return Parser.AST_MAPPING[symbol](children)
+        except KeyError:
+            if symbol == 'suite':
+                # We don't want the INDENT/DEDENT in our parser tree. Those
+                # leaves are just cancer. They are virtual leaves and not real
+                # ones and therefore have pseudo start/end positions and no
+                # prefixes. Just ignore them.
+                children = [children[0]] + children[2:-1]
+            return pt.Node(symbol, children)
+
+    def convert_leaf(self, grammar, type, value, prefix, start_pos):
+        # print('leaf', repr(value), token.tok_name[type])
+        if type == tokenize.NAME:
+            if value in grammar.keywords:
+                return pt.Keyword(value, start_pos, prefix)
+            else:
+                name = pt.Name(value, start_pos, prefix)
+                # Keep a listing of all used names
+                arr = self._used_names.setdefault(name.value, [])
+                arr.append(name)
+                return name
+        elif type == STRING:
+            return pt.String(value, start_pos, prefix)
+        elif type == NUMBER:
+            return pt.Number(value, start_pos, prefix)
+        elif type == NEWLINE:
+            return pt.Newline(value, start_pos, prefix)
+        elif type == ENDMARKER:
+            return pt.EndMarker(value, start_pos, prefix)
+        else:
+            return pt.Operator(value, start_pos, prefix)
+
+    def remove_last_newline(self):
+        endmarker = self._parsed.children[-1]
+        # The newline is either in the endmarker as a prefix or the previous
+        # leaf as a newline token.
+        prefix = endmarker.prefix
+        if prefix.endswith('\n'):
+            endmarker.prefix = prefix = prefix[:-1]
+            last_end = 0
+            if '\n' not in prefix:
+                # Basically, if the last line doesn't end with a newline, we
+                # have to add the previous line's end_position.
+                previous_leaf = endmarker.get_previous_leaf()
+                if previous_leaf is not None:
+                    last_end = previous_leaf.end_pos[1]
+            last_line = re.sub('.*\n', '', prefix)
+            endmarker.start_pos = endmarker.line - 1, last_end + len(last_line)
+        else:
+            newline = endmarker.get_previous_leaf()
+            if newline is None:
+                return  # This means that the parser is empty.
+
+            assert newline.value.endswith('\n')
+            newline.value = newline.value[:-1]
+            endmarker.start_pos = \
+                newline.start_pos[0], newline.start_pos[1] + len(newline.value)
+
+
+class ParserWithRecovery(Parser):
+    """
+    This class is used to parse a Python file; it then divides it into a
+    class structure of different scopes.
+
+    :param grammar: The grammar object of pgen2. Loaded by load_grammar.
+    :param source: The codebase for the parser. Must be unicode.
+    :param module_path: The path of the module in the file system, may be None.
+    :type module_path: str
+    """
+    def __init__(self, grammar, source, module_path=None, tokens=None,
+                 start_parsing=True):
+        self.syntax_errors = []
+
+        self._omit_dedent_list = []
+        self._indent_counter = 0
+        self._module_path = module_path
+
+        # TODO do print absolute import detection here.
+        # try:
+        #     del python_grammar_no_print_statement.keywords["print"]
+        # except KeyError:
+        #     pass  # Doesn't exist in the Python 3 grammar.
+
+        # if self.options["print_function"]:
+        #     python_grammar = pygram.python_grammar_no_print_statement
+        # else:
+        super(ParserWithRecovery, self).__init__(
+            grammar, source,
+            tokens=tokens,
+            start_parsing=start_parsing
+        )
+
+    def parse(self, tokenizer):
+        root_node = super(ParserWithRecovery, self).parse(self._tokenize(tokenizer))
+        self.module = root_node
+        self.module.used_names = self._used_names
+        self.module.path = self._module_path
+        return root_node
+
+    def error_recovery(self, grammar, stack, arcs, typ, value, start_pos, prefix,
+                       add_token_callback):
+        """
+        This parser is written in a dynamic way, meaning that this parser
+        allows using different grammars (even non-Python). However, error
+        recovery is purely written for Python.
+        """
+        def current_suite(stack):
+            # For now just discard everything that is not a suite or
+            # file_input, if we detect an error.
+            for index, (dfa, state, (type_, nodes)) in reversed(list(enumerate(stack))):
+                # `suite` can sometimes be only simple_stmt, not stmt.
+                symbol = grammar.number2symbol[type_]
+                if symbol == 'file_input':
+                    break
+                elif symbol == 'suite' and len(nodes) > 1:
+                    # suites without an indent in them get discarded.
+                    break
+                elif symbol == 'simple_stmt' and len(nodes) > 1:
+                    # simple_stmt can just be turned into a Node, if there are
+                    # enough statements. Ignore the rest after that.
+                    break
+            return index, symbol, nodes
+
+        index, symbol, nodes = current_suite(stack)
+        if symbol == 'simple_stmt':
+            index -= 2
+            (_, _, (type_, suite_nodes)) = stack[index]
+            symbol = grammar.number2symbol[type_]
+            suite_nodes.append(pt.Node(symbol, list(nodes)))
+            # Remove
+            nodes[:] = []
+            nodes = suite_nodes
+            stack[index]
+
+        # print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index)
+        if self._stack_removal(grammar, stack, arcs, index + 1, value, start_pos):
+            add_token_callback(typ, value, start_pos, prefix)
+        else:
+            if typ == INDENT:
+                # For every deleted INDENT we have to delete a DEDENT as well.
+                # Otherwise the parser will get into trouble and DEDENT too early.
+                self._omit_dedent_list.append(self._indent_counter)
+            else:
+                error_leaf = pt.ErrorLeaf(tok_name[typ].lower(), value, start_pos, prefix)
+                stack[-1][2][1].append(error_leaf)
+
+    def _stack_removal(self, grammar, stack, arcs, start_index, value, start_pos):
+        failed_stack = []
+        found = False
+        all_nodes = []
+        for dfa, state, (typ, nodes) in stack[start_index:]:
+            if nodes:
+                found = True
+            if found:
+                symbol = grammar.number2symbol[typ]
+                failed_stack.append((symbol, nodes))
+                all_nodes += nodes
+        if failed_stack:
+            stack[start_index - 1][2][1].append(pt.ErrorNode(all_nodes))
+
+        stack[start_index:] = []
+        return failed_stack
+
+    def _tokenize(self, tokenizer):
+        for typ, value, start_pos, prefix in tokenizer:
+            # print(tokenize.tok_name[typ], repr(value), start_pos, repr(prefix))
+            if typ == DEDENT:
+                # We need to count indents, because if we just omit any DEDENT,
+                # we might omit them in the wrong place.
+                o = self._omit_dedent_list
+                if o and o[-1] == self._indent_counter:
+                    o.pop()
+                    continue
+
+                self._indent_counter -= 1
+            elif typ == INDENT:
+                self._indent_counter += 1
+
+            yield typ, value, start_pos, prefix
+
+    def __repr__(self):
+        return "<%s: %s>" % (type(self).__name__, self.module)
diff --git a/test/test_evaluate/test_buildout_detection.py b/test/test_evaluate/test_buildout_detection.py
index 58f8843e..85855308 100644
--- a/test/test_evaluate/test_buildout_detection.py
+++ b/test/test_evaluate/test_buildout_detection.py
@@ -9,7 +9,7 @@ from jedi.evaluate.sys_path import (_get_parent_dir_with_file,
 from jedi.evaluate import Evaluator
 from jedi.evaluate.representation import ModuleContext
 from jedi.parser.python import parse, load_grammar
-from jedi.parser.parser import ParserWithRecovery
+from jedi.parser.python.parser import ParserWithRecovery
 from ..helpers import cwd_at
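
Below is a minimal usage sketch of the relocated classes. The snippet is not part of the patch: the import paths, constructor signatures, and ParserSyntaxError.position come from the diff above, while the sample sources and call sites are made-up illustrations.

    # Hypothetical example; after this change the parser classes live in
    # jedi.parser.python.parser, while ParserSyntaxError stays in jedi.parser.parser.
    from jedi.parser.python import load_grammar
    from jedi.parser.python.parser import Parser, ParserWithRecovery
    from jedi.parser.parser import ParserSyntaxError

    grammar = load_grammar()

    # ParserWithRecovery tolerates broken code and still returns a module node.
    module = ParserWithRecovery(grammar, u"def f(x):\n    return x +\n").get_root_node()
    print(module.used_names)

    # The plain Parser has no error recovery and raises ParserSyntaxError instead.
    try:
        Parser(grammar, u"def f(:\n")
    except ParserSyntaxError as e:
        print("invalid syntax at", e.position)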