From d4a10929e2b789a84a88c6cc60a6eae174a6291c Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Mon, 23 May 2016 18:11:44 +0200 Subject: [PATCH] Starting to create a way of how context sensitive completions can be made. This involves playing heavily with the parser pgen2. We use its stack to check for all possible tokens/keywords. --- jedi/api/__init__.py | 4 +- jedi/api/completion.py | 74 ++++++++++++++++++++++++++++------ jedi/api/helpers.py | 80 +++++++++++++++++++++++++++++++++++++ jedi/parser/__init__.py | 79 +++++++++++++++++++++++++----------- jedi/parser/fast.py | 2 +- jedi/parser/pgen2/parse.py | 24 ++++++----- jedi/parser/tokenize.py | 16 +++++--- jedi/parser/user_context.py | 44 ++++++++++++++++++-- 8 files changed, 264 insertions(+), 59 deletions(-) diff --git a/jedi/api/__init__.py b/jedi/api/__init__.py index 202f8622..df8d17e7 100644 --- a/jedi/api/__init__.py +++ b/jedi/api/__init__.py @@ -186,7 +186,7 @@ class Script(object): return new_defs goto_path = self._user_context.get_path_under_cursor() - context = self._user_context.get_context() + context = self._user_context.get_reverse_context() definitions = [] if next(context) in ('class', 'def'): definitions = [self._evaluator.wrap(self._parser.user_scope())] @@ -253,7 +253,7 @@ class Script(object): return definitions goto_path = self._user_context.get_path_under_cursor() - context = self._user_context.get_context() + context = self._user_context.get_reverse_context() user_stmt = self._parser.user_stmt() user_scope = self._parser.user_scope() diff --git a/jedi/api/completion.py b/jedi/api/completion.py index 265e8cb2..d86687c1 100644 --- a/jedi/api/completion.py +++ b/jedi/api/completion.py @@ -1,6 +1,7 @@ from itertools import chain import re +from jedi.parser import token from jedi.parser import tree from jedi import debug from jedi import settings @@ -70,7 +71,7 @@ class Completion: user_stmt = self._parser.user_stmt_with_whitespace() - completion_names = self.get_completions(user_stmt, completion_parts) + completion_names = self._get_context_completions(user_stmt, completion_parts) if not completion_parts.has_dot: call_signatures = self._call_signatures_method() @@ -85,30 +86,73 @@ class Completion: x.name.startswith('_'), x.name.lower())) - def get_completions(self, user_stmt, completion_parts): - # TODO this closure is ugly. it also doesn't work with - # simple_complete (used for Interpreter), somehow redo. + def _get_context_completions(self, user_stmt, completion_parts): + """ + Analyzes the context that a completion is made in and decides what to + return. + + Could provide context for: + - from/import completions + - as nothing + - statements that start always on new line + 'import', 'class', 'def', 'try', 'except', + 'finally', 'while', with + - statements that start always on new line or after ; or after : + return raise continue break del pass global nonlocal assert + - def/class nothing + - async for/def/with + - \n@/del/return/raise no keyword (after keyword no keyword)? + - after keyword + - continue/break/pass nothing + - global/nonlocal search global + - after operator no keyword: return + - yield like return + after ( and = + - almost always ok + 'and', 'for', 'if', 'else', 'in', 'is', 'lambda', 'not', 'or' + - after operations no keyword: + + = * ** - etc Maybe work with the parser state? 
+ + # hard: + - await + - yield from / raise from / from import difference + - In args: */**: no completion + - In params (also lambda): no completion before = + """ module = self._evaluator.wrap(self._parser.module()) names, level, only_modules, unfinished_dotted = \ helpers.check_error_statements(module, self._pos) + + grammar = self._evaluator.grammar + stack = helpers.get_stack_at_position(grammar, module, self._pos) + allowed_keywords, allowed_tokens = \ + helpers.get_possible_completion_types(grammar, stack) + + completion_names = list(self._get_keyword_completion_names(allowed_keywords)) + if token.NAME in allowed_tokens: + # Differentiate between import names and other names. + completion_names += self._simple_complete(completion_parts) + completion_names = [] if names is not None: imp_names = tuple(str(n) for n in names if n.end_pos < self._pos) i = imports.Importer(self._evaluator, imp_names, module, level) completion_names = i.completion_names(self._evaluator, only_modules) + return completion_names + # TODO this paragraph is necessary, but not sure it works. - context = self._user_context.get_context() - if not next(context).startswith('.'): # skip the path - if next(context) == 'from': - # completion is just "import" if before stands from .. - if unfinished_dotted: - return completion_names - else: - return [keywords.keyword(self._evaluator, 'import').name] + context = self._user_context.get_backwards_context_tokens() + x = next(context, None) + #print(x) + #if not x.string.startswith('.'): # skip the path + if next(context, None).string == 'from': + # completion is just "import" if before stands from .. + if unfinished_dotted: + return completion_names + else: + return [keywords.keyword(self._evaluator, 'import').name] if isinstance(user_stmt, tree.Import): - module = self._parser.module() completion_names += imports.completion_names(self._evaluator, user_stmt, self._pos) return completion_names @@ -126,6 +170,10 @@ class Completion: completion_names += self._simple_complete(completion_parts) return completion_names + def _get_keyword_completion_names(self, keywords): + for keyword in keywords: + yield keywords.keyword(self._evaluator, keyword).name + def _simple_complete(self, completion_parts): if not completion_parts.path and not completion_parts.has_dot: scope = self._parser.user_scope() diff --git a/jedi/api/helpers.py b/jedi/api/helpers.py index 5549c93b..e42ff76d 100644 --- a/jedi/api/helpers.py +++ b/jedi/api/helpers.py @@ -4,8 +4,11 @@ Helpers for the API import re from collections import namedtuple +from jedi import common from jedi.parser import tree as pt from jedi.evaluate import imports +from jedi import parser +from jedi.parser import tokenize, token CompletionParts = namedtuple('CompletionParts', ['path', 'has_dot', 'name']) @@ -46,6 +49,83 @@ def check_error_statements(module, pos): return None, 0, False, False +def get_code_until(code, start_pos, end_pos): + lines = common.splitlines(code) + line_difference = end_pos[0] - start_pos[0] + if line_difference == 0: + end_line_length = end_pos[1] - start_pos[1] + else: + end_line_length = end_pos[1] + + if line_difference > len(lines) or end_line_length > len(lines[-1]): + raise ValueError("The end_pos seems to be after the code part.") + + new_lines = lines[:line_difference] + [lines[-1][:end_line_length]] + return '\n'.join(new_lines) + + +def get_stack_at_position(grammar, module, pos): + """ + Returns the possible node names (e.g. import_from, xor_test or yield_stmt). 
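+
+    The stack is obtained by taking the code of the error statement that
+    covers `pos`, cutting it off at `pos`, tokenizing it and replaying the
+    tokens through a fresh pgen2 parser; the tokenizer below stops right
+    before ENDMARKER, so the parser's stack still describes the grammar
+    context at the cursor.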
+ """ + for error_statement in module.error_statement_stacks: + if error_statement.first_pos < pos <= error_statement.next_start_pos: + code = error_statement.get_code() + code = get_code_until(code, error_statement.first_pos, pos) + break + else: + raise NotImplementedError + + class EndMarkerReached(Exception): + pass + + def tokenize_without_endmarker(code): + for token_ in tokenize.source_tokens(code): + if token_[0] == token.ENDMARKER: + raise EndMarkerReached() + else: + yield token_ + + p = parser.Parser(grammar, code, tokenizer=tokenize_without_endmarker(code), + start_parsing=False) + try: + p.parse() + except EndMarkerReached: + return p.pgen_parser.stack + + +def get_possible_completion_types(grammar, stack): + def add_results(label_index): + try: + grammar_labels.append(inversed_tokens[label_index]) + except KeyError: + try: + keywords.append(inversed_keywords[label_index]) + except KeyError: + t, v = grammar.labels[label_index] + assert t >= 256 + # See if it's a symbol and if we're in its first set + inversed_keywords + itsdfa = grammar.dfas[t] + itsstates, itsfirst = itsdfa + for first_label_index in itsfirst.keys(): + add_results(first_label_index) + + dfa, state, node = stack[-1] + states, first = dfa + arcs = states[state] + + inversed_keywords = dict((v, k) for k, v in grammar.keywords.items()) + inversed_tokens = dict((v, k) for k, v in grammar.tokens.items()) + + keywords = [] + grammar_labels = [] + for label_index, new_state in arcs: + add_results(label_index) + + return keywords, grammar_labels + + def importer_from_error_statement(error_statement, pos): def check_dotted(children): for name in children[::2]: diff --git a/jedi/parser/__init__.py b/jedi/parser/__init__.py index 93be5f16..d00b587b 100644 --- a/jedi/parser/__init__.py +++ b/jedi/parser/__init__.py @@ -24,7 +24,7 @@ from jedi.parser import token from jedi.parser.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER, STRING, OP, ERRORTOKEN) from jedi.parser.pgen2.pgen import generate_grammar -from jedi.parser.pgen2.parse import PgenParser +from jedi.parser.pgen2.parse import PgenParser, token_to_ilabel OPERATOR_KEYWORDS = 'and', 'for', 'if', 'else', 'in', 'is', 'lambda', 'not', 'or' # Not used yet. 
In the future I intend to add something like KeywordStatement @@ -60,12 +60,20 @@ def load_grammar(version='3.4'): class ErrorStatement(object): - def __init__(self, stack, next_token, position_modifier, next_start_pos): + def __init__(self, stack, arcs, next_token, position_modifier, next_start_pos): self.stack = stack + self.arcs = arcs self._position_modifier = position_modifier self.next_token = next_token self._next_start_pos = next_start_pos + def __repr__(self): + return '<%s next: %s@%s>' % ( + type(self).__name__, + repr(self.next_token), + self.next_start_pos + ) + @property def next_start_pos(self): s = self._next_start_pos @@ -81,6 +89,16 @@ class ErrorStatement(object): first_type, nodes = self.stack[0] return first_type + def is_a_valid_token(self, type_, value): + ilabel = token_to_ilabel(type_, value) + for i, newstate in self.arcs: + if ilabel == i: + return True + return False + + def get_code(self): + return ''.join(node.get_code() for _, nodes in self.stack for node in nodes) + class ParserSyntaxError(object): def __init__(self, message, position): @@ -119,8 +137,10 @@ class Parser(object): 'lambdef_nocond': pt.Lambda, } - def __init__(self, grammar, source, start, tokenizer=None): - start_number = grammar.symbol2number[start] + def __init__(self, grammar, source, start_symbol='file_input', + tokenizer=None, start_parsing=True): + # Todo Remove start_parsing (with False) + start_number = grammar.symbol2number[start_symbol] self._used_names = {} self._scope_names_stack = [{}] @@ -131,27 +151,42 @@ class Parser(object): # For the fast parser. self.position_modifier = pt.PositionModifier() - added_newline = False + self._added_newline = False # The Python grammar needs a newline at the end of each statement. - if not source.endswith('\n') and start == 'file_input': + if not source.endswith('\n') and start_symbol == 'file_input': source += '\n' - added_newline = True + self._added_newline = True - p = PgenParser(grammar, self.convert_node, self.convert_leaf, - self.error_recovery, start_number) + self.pgen_parser = PgenParser( + grammar, self.convert_node, self.convert_leaf, + self.error_recovery, start_number + ) + + self._start_symbol = start_symbol + self._grammar = grammar + self._tokenizer = tokenizer if tokenizer is None: - tokenizer = tokenize.source_tokens(source) + self._tokenizer = tokenize.source_tokens(source, use_exact_op_types=True) - self._parsed = p.parse(self._tokenize(tokenizer)) + self._parsed = None - if start == 'file_input' != self._parsed.type: + if start_parsing: + self.parse() + + def parse(self): + if self._parsed is not None: + return self._parsed + + self._parsed = self.pgen_parser.parse(self._tokenize(self._tokenizer)) + + if self._start_symbol == 'file_input' != self._parsed.type: # If there's only one statement, we get back a non-module. 
That's # not what we want, we want a module, so we add it here: - self._parsed = self.convert_node(grammar, - grammar.symbol2number['file_input'], + self._parsed = self.convert_node(self._grammar, + self._grammar.symbol2number['file_input'], [self._parsed]) - if added_newline: + if self._added_newline: self.remove_last_newline() def get_parsed_node(self): @@ -161,8 +196,6 @@ class Parser(object): for typ, value, start_pos, prefix in tokenizer: if typ == ERRORTOKEN: raise ParseError - elif typ == OP: - typ = token.opmap[value] yield typ, value, prefix, start_pos def error_recovery(self, grammar, stack, typ, value, start_pos, prefix, @@ -301,7 +334,7 @@ class ParserWithRecovery(Parser): #if self.options["print_function"]: # python_grammar = pygram.python_grammar_no_print_statement #else: - super(ParserWithRecovery, self).__init__(grammar, source, 'file_input', tokenizer) + super(ParserWithRecovery, self).__init__(grammar, source, tokenizer=tokenizer) self.module = self._parsed self.module.used_names = self._used_names @@ -309,7 +342,7 @@ class ParserWithRecovery(Parser): self.module.global_names = self._global_names self.module.error_statement_stacks = self._error_statement_stacks - def error_recovery(self, grammar, stack, typ, value, start_pos, prefix, + def error_recovery(self, grammar, stack, arcs, typ, value, start_pos, prefix, add_token_callback): """ This parser is written in a dynamic way, meaning that this parser @@ -345,7 +378,7 @@ class ParserWithRecovery(Parser): stack[index] #print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index) - self._stack_removal(grammar, stack, index + 1, value, start_pos) + self._stack_removal(grammar, stack, arcs, index + 1, value, start_pos) if typ == INDENT: # For every deleted INDENT we have to delete a DEDENT as well. # Otherwise the parser will get into trouble and DEDENT too early. @@ -366,7 +399,7 @@ class ParserWithRecovery(Parser): # doesn't stop you from defining `continue` in a module, etc. 
add_token_callback(typ, value, prefix, start_pos) - def _stack_removal(self, grammar, stack, start_index, value, start_pos): + def _stack_removal(self, grammar, stack, arcs, start_index, value, start_pos): def clear_names(children): for c in children: try: @@ -393,7 +426,7 @@ class ParserWithRecovery(Parser): if nodes and nodes[0] in ('def', 'class', 'lambda'): self._scope_names_stack.pop() if failed_stack: - err = ErrorStatement(failed_stack, value, self.position_modifier, start_pos) + err = ErrorStatement(failed_stack, arcs, value, self.position_modifier, start_pos) self._error_statement_stacks.append(err) self._last_failed_start_pos = start_pos @@ -418,8 +451,6 @@ class ParserWithRecovery(Parser): self._add_syntax_error('Strange token', start_pos) continue - if typ == OP: - typ = token.opmap[value] yield typ, value, prefix, start_pos def _add_syntax_error(self, message, position): diff --git a/jedi/parser/fast.py b/jedi/parser/fast.py index f62f9a53..0c0041f9 100644 --- a/jedi/parser/fast.py +++ b/jedi/parser/fast.py @@ -451,7 +451,7 @@ class FastTokenizer(object): """ def __init__(self, source): self.source = source - self._gen = source_tokens(source) + self._gen = source_tokens(source, use_exact_op_types=True) self._closed = False # fast parser options diff --git a/jedi/parser/pgen2/parse.py b/jedi/parser/pgen2/parse.py index c6a9109b..dcd0fe0b 100644 --- a/jedi/parser/pgen2/parse.py +++ b/jedi/parser/pgen2/parse.py @@ -34,6 +34,18 @@ class InternalParseError(Exception): self.start_pos = start_pos +def token_to_ilabel(grammar, type_, value): + # Map from token to label + if type_ == tokenize.NAME: + # Check for reserved words (keywords) + try: + return grammar.keywords[value] + except KeyError: + pass + + return grammar.tokens[type_] + + class PgenParser(object): """Parser engine. @@ -118,15 +130,7 @@ class PgenParser(object): def addtoken(self, type_, value, prefix, start_pos): """Add a token; return True if this is the end of the program.""" - # Map from token to label - if type_ == tokenize.NAME: - # Check for reserved words (keywords) - try: - ilabel = self.grammar.keywords[value] - except KeyError: - ilabel = self.grammar.tokens[type_] - else: - ilabel = self.grammar.tokens[type_] + ilabel = token_to_ilabel(self.grammar, type_, value) # Loop until the token is shifted; may raise exceptions while True: @@ -168,7 +172,7 @@ class PgenParser(object): # Done parsing, but another token is input raise InternalParseError("too much input", type_, value, start_pos) else: - self.error_recovery(self.grammar, self.stack, type_, + self.error_recovery(self.grammar, self.stack, arcs, type_, value, start_pos, prefix, self.addtoken) break diff --git a/jedi/parser/tokenize.py b/jedi/parser/tokenize.py index 194a0967..a5c55bf5 100644 --- a/jedi/parser/tokenize.py +++ b/jedi/parser/tokenize.py @@ -14,7 +14,7 @@ from __future__ import absolute_import import string import re from io import StringIO -from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, +from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap, NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT) from jedi._compatibility import is_py3 @@ -143,18 +143,19 @@ del _compile tabsize = 8 +# TODO add with? 
ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except', 'finally', 'while', 'return') -def source_tokens(source): +def source_tokens(source, use_exact_op_types=False): """Generate tokens from a the source code (string).""" source = source readline = StringIO(source).readline - return generate_tokens(readline) + return generate_tokens(readline, use_exact_op_types) -def generate_tokens(readline): +def generate_tokens(readline, use_exact_op_types=False): """ A heavily modified Python standard library tokenizer. @@ -285,7 +286,12 @@ def generate_tokens(readline): paren_level += 1 elif token in ')]}': paren_level -= 1 - yield OP, token, spos, prefix + + if use_exact_op_types: + typ = opmap[token] + else: + typ = OP + yield typ, token, spos, prefix if new_line: end_pos = lnum + 1, 0 diff --git a/jedi/parser/user_context.py b/jedi/parser/user_context.py index d62a383f..ae073b98 100644 --- a/jedi/parser/user_context.py +++ b/jedi/parser/user_context.py @@ -1,16 +1,20 @@ import re import os import keyword +from collections import namedtuple from jedi import cache from jedi import common from jedi.parser import tokenize, ParserWithRecovery from jedi._compatibility import u +from jedi.parser import token from jedi.parser.fast import FastParser from jedi.parser import tree from jedi import debug from jedi.common import PushBackIterator +# TODO this should be part of the tokenizer not just of this user_context. +Token = namedtuple('Token', ['type', 'string', 'start_pos', 'prefix']) REPLACE_STR = r"[bBuU]?[rR]?" + (r"(?:(')[^\n'\\]*(?:\\.[^\n'\\]*)*(?:'|$)" + '|' + @@ -66,7 +70,7 @@ class UserContext(object): first_line = common.splitlines(tok_str)[0] column -= len(first_line) # Reverse the token again, so that it is in normal order again. - yield typ, tok_str[::-1], (self._line_temp, column), prefix[::-1] + yield Token(typ, tok_str[::-1], (self._line_temp, column), prefix[::-1]) def _calc_path_until_cursor(self, start_pos): """ @@ -214,11 +218,14 @@ class UserContext(object): next_is_key = True return None, 0, None, (0, 0) - def get_context(self, yield_positions=False): + def get_reverse_context(self, yield_positions=False): + """ + Returns the token strings in reverse order from the start position. + """ self.get_path_until_cursor() # In case _start_cursor_pos is undefined. pos = self._start_cursor_pos while True: - # remove non important white space + # Remove non important white space. line = self.get_line(pos[0]) while True: if pos[1] == 0: @@ -246,6 +253,35 @@ class UserContext(object): else: yield '' + def get_backwards_context_tokens(self): + self.get_path_until_cursor() # In case _start_cursor_pos is undefined. + pos = self._start_cursor_pos + while True: + # Remove non important white space. + line = self.get_line(pos[0]) + while True: + if pos[1] == 0: + line = self.get_line(pos[0] - 1) + if line and line[-1] == '\\': + pos = pos[0] - 1, len(line) - 1 + continue + else: + break + + if line[pos[1] - 1].isspace(): + pos = pos[0], pos[1] - 1 + else: + break + + try: + token_ = next(self._get_backwards_tokenizer(pos)) + pos = token_.start_pos + yield token_ + except StopIteration: + # Make it clear that there's nothing coming anymore. + #yield Token('', token.ENDMARKER, (1, 0), '') + break + def get_line(self, line_nr): if not self._line_cache: self._line_cache = common.splitlines(self.source) @@ -310,7 +346,7 @@ class UserContextParser(object): # process it - probably a Syntax Error (or in a comment). 
                 debug.warning('No statement under the cursor.')
                 return
-            pos = next(self._user_context.get_context(yield_positions=True))
+            pos = next(self._user_context.get_reverse_context(yield_positions=True))
             user_stmt = self.module().get_statement_for_position(pos)
         return user_stmt
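
Roughly, the pieces above are meant to be combined like the sketch below. The
source and pos values are made up for illustration, and the assumption that
ParserWithRecovery(grammar, source) is enough to build a module with
error_statement_stacks follows from the changes above; whether this exact
snippet already runs end to end depends on the still unfinished
error-statement bookkeeping, it only shows the intended flow from error
recovery to the pgen2 stack to the allowed keywords and token types.

    from jedi.parser import load_grammar, ParserWithRecovery
    from jedi.parser import token
    from jedi.api import helpers

    # Incomplete assignment; the cursor sits right behind the "=".
    source = 'if foo:\n    bar ='
    pos = (2, 9)

    grammar = load_grammar()
    # Parsing with error recovery records an ErrorStatement (now carrying the
    # grammar arcs as well) for the unfinished statement.
    module = ParserWithRecovery(grammar, source).module

    # Replay the broken statement up to the cursor and look at the pgen2 stack.
    stack = helpers.get_stack_at_position(grammar, module, pos)
    allowed_keywords, allowed_tokens = \
        helpers.get_possible_completion_types(grammar, stack)

    # Right after "=" the grammar accepts expressions, so NAME shows up in
    # allowed_tokens and keywords like 'not' or 'lambda' in allowed_keywords;
    # Completion._get_context_completions turns these into completion names.
    print(token.NAME in allowed_tokens, sorted(allowed_keywords))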