The tokenizer now includes all newlines and comments in its prefix.

Dave Halter
2014-12-17 20:11:42 +01:00
parent 9cdf6de206
commit b2e54ca1eb
4 changed files with 24 additions and 13 deletions
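
The "prefix" notion follows lib2to3: every leaf owns the whitespace, newlines, and comments that precede it, so the tree can reproduce the source exactly. A quick demonstration of that convention with the stdlib's lib2to3 itself (available up to CPython 3.12; this is not jedi's API):

    from lib2to3 import pygram, pytree
    from lib2to3.pgen2 import driver

    drv = driver.Driver(pygram.python_grammar, convert=pytree.convert)
    tree = drv.parse_string("# a comment\nx = 1\n")
    first_leaf = next(tree.leaves())
    print(repr(first_leaf.prefix))  # '# a comment\n'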

View File

@@ -5,6 +5,7 @@ the interesting information about completion and goto operations.
 """
 import warnings
 from itertools import chain
+import re
 
 from jedi._compatibility import next, unicode, use_metaclass
 from jedi import settings
@@ -579,7 +580,10 @@ class Definition(use_metaclass(CachedMetaClass, BaseDefinition)):
                 d = d.get_code()
             finally:
                 first_leaf.prefix = old
-        return d.replace('\n', '').replace('\r', '')
+        # Delete comments:
+        d = re.sub('#[^\n]+\n', ' ', d)
+        # Delete multi spaces/newlines
+        return re.sub('\s+', ' ', d).strip()
 
     @property
     def desc_with_module(self):
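
For illustration, the effect of the new cleanup on a multi-line definition (a hedged sketch with a made-up input; raw-string versions of the patterns above):

    import re

    d = "for x in range(3):\n    # irrelevant\n    x"
    d = re.sub(r'#[^\n]+\n', ' ', d)    # delete comments
    d = re.sub(r'\s+', ' ', d).strip()  # collapse whitespace/newlines
    print(d)  # for x in range(3): x

The old replace('\n', '').replace('\r', '') only deleted line breaks, which left indentation runs in the description (see the test changes below) and could not cope with the comments that get_code now includes.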

View File

@@ -14,8 +14,8 @@ from __future__ import absolute_import
 import string
 import re
 from io import StringIO
-from token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, NAME, OP,
-                   ERRORTOKEN, NEWLINE, INDENT, DEDENT)
+from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER,
+                               NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
 
 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
 
@@ -147,8 +147,11 @@ def source_tokens(source, line_offset=0):
 
 def generate_tokens(readline, line_offset=0):
     """
-    The original stdlib Python version with minor modifications.
-    Modified to not care about dedents.
+    A heavily modified Python standard library tokenizer.
+
+    Additionally to the default information, yields also the prefix of each
+    token. This idea comes from lib2to3. The prefix contains all information
+    that is irrelevant for the parser like newlines in parentheses or comments.
     """
     paren_level = 0  # count parentheses
     indents = [0]
@@ -158,6 +161,7 @@ def generate_tokens(readline, line_offset=0):
     contline = None
     new_line = False
     prefix = ''  # Should never be required, but here for safety
+    additional_prefix = ''
    while True:            # loop over lines in stream
         line = readline()  # readline returns empty when finished. See StringIO
         if not line:
@@ -192,7 +196,8 @@ def generate_tokens(readline, line_offset=0):
                 pos += 1
                 continue
-            prefix = pseudomatch.group(1)
+            prefix = pseudomatch.group(1) + additional_prefix
+            additional_prefix = ''
             start, pos = pseudomatch.span(2)
             spos = (lnum, start)
             token, initial = line[start:pos], line[start]
@@ -213,10 +218,12 @@ def generate_tokens(readline, line_offset=0):
             elif initial in '\r\n':
                 if not new_line and paren_level == 0:
                     yield NEWLINE, token, spos, prefix
+                else:
+                    additional_prefix = prefix + token
                 new_line = True
-            elif initial == '#':
+            elif initial == '#':  # Comments
                 assert not token.endswith("\n")
-                #yield Token(COMMENT, token, spos, prefix)
+                additional_prefix = prefix + token
             elif token in triple_quoted:
                 endprog = endprogs[token]
                 endmatch = endprog.match(line, pos)
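
In short: comments and insignificant newlines are no longer emitted (or silently dropped); they accumulate in additional_prefix until the next real token claims them. A stripped-down sketch of that buffering (a toy scanner with a made-up regex and token set, not jedi's real tokenizer):

    import re

    PATTERN = re.compile(r'([ \t]*)(#[^\n]*|\n|\w+|[^\s])')

    def toy_tokens(source):
        additional_prefix = ''
        pos = 0
        while pos < len(source):
            match = PATTERN.match(source, pos)
            if match is None:  # nothing but trailing whitespace left
                break
            whitespace, value = match.groups()
            pos = match.end()
            if value.startswith('#') or value == '\n':
                # Buffer comments and newlines as prefix material.
                additional_prefix += whitespace + value
            else:
                yield value, additional_prefix + whitespace
                additional_prefix = ''

    for tok, prefix in toy_tokens("x = 1  # note\ny"):
        print(repr(tok), repr(prefix))
    # 'x' ''   '=' ' '   '1' ' '   'y' '  # note\n'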

View File

@@ -59,7 +59,7 @@ class UserContext(object):
                 first_line = (tok_str.splitlines() or [''])[0]
                 column -= len(first_line)
             # Reverse the token again, so that it is in normal order again.
-            yield typ, tok_str[::-1], (self._line_temp, column), prefix
+            yield typ, tok_str[::-1], (self._line_temp, column), prefix[::-1]
 
     def _calc_path_until_cursor(self, start_pos):
         """
@@ -122,7 +122,7 @@ class UserContext(object):
                     break
                 start_cursor = tok_start_pos
-                string = tok_str + string
+                string = tok_str + prefix + string
                 last_type = tok_type
 
                 # Don't need whitespace around a statement.
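
The two changes belong together: this scanner tokenizes reversed source, so prefixes arrive mirrored (hence prefix[::-1]), and a token's prefix is re-attached between it and the already-collected text when rebuilding the path (tok_str + prefix + string). A hypothetical round trip with made-up scan output:

    source = 'foo # c\n.bar'
    # (token, prefix) pairs as the reversed scan might emit them:
    backwards = [('rab', ''), ('.', ''), ('oof', '\nc # ')]
    string = ''
    for tok_str, prefix in backwards:
        tok_str, prefix = tok_str[::-1], prefix[::-1]  # un-reverse both
        string = tok_str + prefix + string             # prefix re-attached
    print(string == source)  # True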

View File

@@ -181,13 +181,13 @@ ab1(ClassDef);ab2(ClassDef);ab3(ClassDef)
 # -----------------
 for i in range(1):
-    #! ['for i in range(1):    i']
+    #! ['for i in range(1): i']
     i
 
 for key, value in [(1,2)]:
-    #! ['for key, value in [(1,2)]:    key']
+    #! ['for key, value in [(1,2)]: key']
     key
 
 for i in []:
-    #! ['for i in []:    i']
+    #! ['for i in []: i']
     i
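
The updated expectations follow from the new description cleanup (the re.sub calls in the first file); a small illustrative check:

    import re

    code = "for i in []:\n    i"
    old = code.replace('\n', '').replace('\r', '')
    new = re.sub(r'\s+', ' ', re.sub(r'#[^\n]+\n', ' ', code)).strip()
    print(repr(old))  # 'for i in []:    i'
    print(repr(new))  # 'for i in []: i'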