Starting to create a way to make context-sensitive completions.

This involves working heavily with the pgen2 parser. We use its stack to check which tokens/keywords are possible at the current position.
Dave Halter
2016-05-23 18:11:44 +02:00
parent 36a135c347
commit d4a10929e2
8 changed files with 264 additions and 59 deletions
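
The idea in a nutshell: once the parser stops, the DFA arcs of the topmost pgen2 stack entry say exactly which token labels (ilabels) may come next, so completion only has to translate keywords/tokens into ilabels and test them against those arcs. Below is a minimal, self-contained sketch of that idea using an invented toy grammar; the dict-based grammar, possible_keywords() and the hard-coded arcs are illustration only, not jedi's real objects.

# Minimal sketch (toy grammar invented for illustration, not jedi's real
# Grammar/stack objects): map keywords/tokens to pgen2 "ilabels" and check
# them against the DFA arcs saved from the parser stack.

def token_to_ilabel(grammar, type_, value):
    # Same kind of mapping this commit factors out of PgenParser.addtoken().
    if type_ == 'NAME':
        try:
            return grammar['keywords'][value]
        except KeyError:
            pass
    return grammar['tokens'].get(type_)


def possible_keywords(grammar, arcs):
    # A keyword is a valid continuation if its ilabel labels one of the arcs.
    labels = set(ilabel for ilabel, _newstate in arcs)
    return sorted(kw for kw, ilabel in grammar['keywords'].items()
                  if ilabel in labels)


toy_grammar = {
    'keywords': {'if': 1, 'else': 2, 'import': 3},
    'tokens': {'NAME': 4, 'NEWLINE': 5},
}
# Pretend the parser stopped where only `else` or a NEWLINE may follow.
arcs = [(2, 0), (5, 7)]
print(possible_keywords(toy_grammar, arcs))            # ['else']
print(token_to_ilabel(toy_grammar, 'NAME', 'else') in
      set(i for i, _ in arcs))                         # True

The actual change stores the arcs on each ErrorStatement and factors the label lookup into token_to_ilabel(), as the diff below shows.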

View File

@@ -24,7 +24,7 @@ from jedi.parser import token
 from jedi.parser.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER,
                                STRING, OP, ERRORTOKEN)
 from jedi.parser.pgen2.pgen import generate_grammar
-from jedi.parser.pgen2.parse import PgenParser
+from jedi.parser.pgen2.parse import PgenParser, token_to_ilabel

 OPERATOR_KEYWORDS = 'and', 'for', 'if', 'else', 'in', 'is', 'lambda', 'not', 'or'
 # Not used yet. In the future I intend to add something like KeywordStatement
@@ -60,12 +60,20 @@ def load_grammar(version='3.4'):
 class ErrorStatement(object):
-    def __init__(self, stack, next_token, position_modifier, next_start_pos):
+    def __init__(self, stack, arcs, next_token, position_modifier, next_start_pos):
         self.stack = stack
+        self.arcs = arcs
         self._position_modifier = position_modifier
         self.next_token = next_token
         self._next_start_pos = next_start_pos

+    def __repr__(self):
+        return '<%s next: %s@%s>' % (
+            type(self).__name__,
+            repr(self.next_token),
+            self.next_start_pos
+        )
+
     @property
     def next_start_pos(self):
         s = self._next_start_pos
@@ -81,6 +89,16 @@ class ErrorStatement(object):
         first_type, nodes = self.stack[0]
         return first_type

+    def is_a_valid_token(self, type_, value):
+        ilabel = token_to_ilabel(type_, value)
+        for i, newstate in self.arcs:
+            if ilabel == i:
+                return True
+        return False
+
+    def get_code(self):
+        return ''.join(node.get_code() for _, nodes in self.stack for node in nodes)
+

 class ParserSyntaxError(object):
     def __init__(self, message, position):
@@ -119,8 +137,10 @@ class Parser(object):
         'lambdef_nocond': pt.Lambda,
     }

-    def __init__(self, grammar, source, start, tokenizer=None):
-        start_number = grammar.symbol2number[start]
+    def __init__(self, grammar, source, start_symbol='file_input',
+                 tokenizer=None, start_parsing=True):
+        # Todo Remove start_parsing (with False)
+        start_number = grammar.symbol2number[start_symbol]
         self._used_names = {}
         self._scope_names_stack = [{}]
@@ -131,27 +151,42 @@ class Parser(object):
         # For the fast parser.
         self.position_modifier = pt.PositionModifier()

-        added_newline = False
+        self._added_newline = False
         # The Python grammar needs a newline at the end of each statement.
-        if not source.endswith('\n') and start == 'file_input':
+        if not source.endswith('\n') and start_symbol == 'file_input':
             source += '\n'
-            added_newline = True
+            self._added_newline = True

-        p = PgenParser(grammar, self.convert_node, self.convert_leaf,
-                       self.error_recovery, start_number)
+        self.pgen_parser = PgenParser(
+            grammar, self.convert_node, self.convert_leaf,
+            self.error_recovery, start_number
+        )
+        self._start_symbol = start_symbol
+        self._grammar = grammar

+        self._tokenizer = tokenizer
         if tokenizer is None:
-            tokenizer = tokenize.source_tokens(source)
+            self._tokenizer = tokenize.source_tokens(source, use_exact_op_types=True)

-        self._parsed = p.parse(self._tokenize(tokenizer))
+        self._parsed = None

-        if start == 'file_input' != self._parsed.type:
+        if start_parsing:
+            self.parse()
+
+    def parse(self):
+        if self._parsed is not None:
+            return self._parsed
+
+        self._parsed = self.pgen_parser.parse(self._tokenize(self._tokenizer))
+        if self._start_symbol == 'file_input' != self._parsed.type:
             # If there's only one statement, we get back a non-module. That's
             # not what we want, we want a module, so we add it here:
-            self._parsed = self.convert_node(grammar,
-                                             grammar.symbol2number['file_input'],
+            self._parsed = self.convert_node(self._grammar,
+                                             self._grammar.symbol2number['file_input'],
                                              [self._parsed])

-        if added_newline:
+        if self._added_newline:
             self.remove_last_newline()

     def get_parsed_node(self):
@@ -161,8 +196,6 @@ class Parser(object):
         for typ, value, start_pos, prefix in tokenizer:
             if typ == ERRORTOKEN:
                 raise ParseError
-            elif typ == OP:
-                typ = token.opmap[value]
             yield typ, value, prefix, start_pos

     def error_recovery(self, grammar, stack, typ, value, start_pos, prefix,
@@ -301,7 +334,7 @@ class ParserWithRecovery(Parser):
         #if self.options["print_function"]:
         #    python_grammar = pygram.python_grammar_no_print_statement
         #else:
-        super(ParserWithRecovery, self).__init__(grammar, source, 'file_input', tokenizer)
+        super(ParserWithRecovery, self).__init__(grammar, source, tokenizer=tokenizer)
         self.module = self._parsed
         self.module.used_names = self._used_names
@@ -309,7 +342,7 @@ class ParserWithRecovery(Parser):
         self.module.global_names = self._global_names
         self.module.error_statement_stacks = self._error_statement_stacks

-    def error_recovery(self, grammar, stack, typ, value, start_pos, prefix,
+    def error_recovery(self, grammar, stack, arcs, typ, value, start_pos, prefix,
                        add_token_callback):
         """
         This parser is written in a dynamic way, meaning that this parser
@@ -345,7 +378,7 @@ class ParserWithRecovery(Parser):
             stack[index]
         #print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index)
-        self._stack_removal(grammar, stack, index + 1, value, start_pos)
+        self._stack_removal(grammar, stack, arcs, index + 1, value, start_pos)
         if typ == INDENT:
             # For every deleted INDENT we have to delete a DEDENT as well.
             # Otherwise the parser will get into trouble and DEDENT too early.
@@ -366,7 +399,7 @@ class ParserWithRecovery(Parser):
             # doesn't stop you from defining `continue` in a module, etc.
             add_token_callback(typ, value, prefix, start_pos)

-    def _stack_removal(self, grammar, stack, start_index, value, start_pos):
+    def _stack_removal(self, grammar, stack, arcs, start_index, value, start_pos):
         def clear_names(children):
             for c in children:
                 try:
@@ -393,7 +426,7 @@ class ParserWithRecovery(Parser):
             if nodes and nodes[0] in ('def', 'class', 'lambda'):
                 self._scope_names_stack.pop()

         if failed_stack:
-            err = ErrorStatement(failed_stack, value, self.position_modifier, start_pos)
+            err = ErrorStatement(failed_stack, arcs, value, self.position_modifier, start_pos)
             self._error_statement_stacks.append(err)

         self._last_failed_start_pos = start_pos
@@ -418,8 +451,6 @@ class ParserWithRecovery(Parser):
                 self._add_syntax_error('Strange token', start_pos)
                 continue

-            if typ == OP:
-                typ = token.opmap[value]
             yield typ, value, prefix, start_pos

     def _add_syntax_error(self, message, position):

View File

@@ -451,7 +451,7 @@ class FastTokenizer(object):
"""
def __init__(self, source):
self.source = source
self._gen = source_tokens(source)
self._gen = source_tokens(source, use_exact_op_types=True)
self._closed = False
# fast parser options

View File

@@ -34,6 +34,18 @@ class InternalParseError(Exception):
         self.start_pos = start_pos


+def token_to_ilabel(grammar, type_, value):
+    # Map from token to label
+    if type_ == tokenize.NAME:
+        # Check for reserved words (keywords)
+        try:
+            return grammar.keywords[value]
+        except KeyError:
+            pass
+
+    return grammar.tokens[type_]
+
+
 class PgenParser(object):
     """Parser engine.
@@ -118,15 +130,7 @@ class PgenParser(object):
     def addtoken(self, type_, value, prefix, start_pos):
         """Add a token; return True if this is the end of the program."""
-        # Map from token to label
-        if type_ == tokenize.NAME:
-            # Check for reserved words (keywords)
-            try:
-                ilabel = self.grammar.keywords[value]
-            except KeyError:
-                ilabel = self.grammar.tokens[type_]
-        else:
-            ilabel = self.grammar.tokens[type_]
+        ilabel = token_to_ilabel(self.grammar, type_, value)

         # Loop until the token is shifted; may raise exceptions
         while True:
@@ -168,7 +172,7 @@ class PgenParser(object):
                         # Done parsing, but another token is input
                         raise InternalParseError("too much input", type_, value, start_pos)
                 else:
-                    self.error_recovery(self.grammar, self.stack, type_,
+                    self.error_recovery(self.grammar, self.stack, arcs, type_,
                                         value, start_pos, prefix, self.addtoken)
                     break

View File

@@ -14,7 +14,7 @@ from __future__ import absolute_import
 import string
 import re
 from io import StringIO

-from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER,
+from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap,
                                NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
 from jedi._compatibility import is_py3
@@ -143,18 +143,19 @@ del _compile
 tabsize = 8

 # TODO add with?
 ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
                        'finally', 'while', 'return')


-def source_tokens(source):
+def source_tokens(source, use_exact_op_types=False):
     """Generate tokens from a the source code (string)."""
     source = source
     readline = StringIO(source).readline
-    return generate_tokens(readline)
+    return generate_tokens(readline, use_exact_op_types)


-def generate_tokens(readline):
+def generate_tokens(readline, use_exact_op_types=False):
     """
     A heavily modified Python standard library tokenizer.
@@ -285,7 +286,12 @@ def generate_tokens(readline):
                 paren_level += 1
             elif token in ')]}':
                 paren_level -= 1
-            yield OP, token, spos, prefix
+            if use_exact_op_types:
+                typ = opmap[token]
+            else:
+                typ = OP
+            yield typ, token, spos, prefix

     if new_line:
         end_pos = lnum + 1, 0

View File

@@ -1,16 +1,20 @@
 import re
 import os
 import keyword
+from collections import namedtuple

 from jedi import cache
 from jedi import common
 from jedi.parser import tokenize, ParserWithRecovery
 from jedi._compatibility import u
+from jedi.parser import token
 from jedi.parser.fast import FastParser
 from jedi.parser import tree
 from jedi import debug
 from jedi.common import PushBackIterator

+# TODO this should be part of the tokenizer not just of this user_context.
+Token = namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])

 REPLACE_STR = r"[bBuU]?[rR]?" + (r"(?:(')[^\n'\\]*(?:\\.[^\n'\\]*)*(?:'|$)" +
                                  '|' +
@@ -66,7 +70,7 @@ class UserContext(object):
             first_line = common.splitlines(tok_str)[0]
             column -= len(first_line)
             # Reverse the token again, so that it is in normal order again.
-            yield typ, tok_str[::-1], (self._line_temp, column), prefix[::-1]
+            yield Token(typ, tok_str[::-1], (self._line_temp, column), prefix[::-1])

     def _calc_path_until_cursor(self, start_pos):
         """
@@ -214,11 +218,14 @@ class UserContext(object):
                 next_is_key = True
         return None, 0, None, (0, 0)

-    def get_context(self, yield_positions=False):
+    def get_reverse_context(self, yield_positions=False):
+        """
+        Returns the token strings in reverse order from the start position.
+        """
         self.get_path_until_cursor()  # In case _start_cursor_pos is undefined.
         pos = self._start_cursor_pos
         while True:
-            # remove non important white space
+            # Remove non important white space.
             line = self.get_line(pos[0])
             while True:
                 if pos[1] == 0:
@@ -246,6 +253,35 @@ class UserContext(object):
             else:
                 yield ''

+    def get_backwards_context_tokens(self):
+        self.get_path_until_cursor()  # In case _start_cursor_pos is undefined.
+        pos = self._start_cursor_pos
+        while True:
+            # Remove non important white space.
+            line = self.get_line(pos[0])
+            while True:
+                if pos[1] == 0:
+                    line = self.get_line(pos[0] - 1)
+                    if line and line[-1] == '\\':
+                        pos = pos[0] - 1, len(line) - 1
+                        continue
+                    else:
+                        break
+
+                if line[pos[1] - 1].isspace():
+                    pos = pos[0], pos[1] - 1
+                else:
+                    break
+
+            try:
+                token_ = next(self._get_backwards_tokenizer(pos))
+                pos = token_.start_pos
+                yield token_
+            except StopIteration:
+                # Make it clear that there's nothing coming anymore.
+                #yield Token('', token.ENDMARKER, (1, 0), '')
+                break

     def get_line(self, line_nr):
         if not self._line_cache:
             self._line_cache = common.splitlines(self.source)
@@ -310,7 +346,7 @@ class UserContextParser(object):
             # process it - probably a Syntax Error (or in a comment).
             debug.warning('No statement under the cursor.')
             return
-        pos = next(self._user_context.get_context(yield_positions=True))
+        pos = next(self._user_context.get_reverse_context(yield_positions=True))
         user_stmt = self.module().get_statement_for_position(pos)
         return user_stmt