The tokenizer now includes all newlines and comments in its prefix.

Dave Halter
2014-12-17 20:11:42 +01:00
parent 9cdf6de206
commit b2e54ca1eb
4 changed files with 24 additions and 13 deletions

View File

@@ -5,6 +5,7 @@ the interesting information about completion and goto operations.
 """
 import warnings
 from itertools import chain
+import re
 
 from jedi._compatibility import next, unicode, use_metaclass
 from jedi import settings
@@ -579,7 +580,10 @@ class Definition(use_metaclass(CachedMetaClass, BaseDefinition)):
                 d = d.get_code()
             finally:
                 first_leaf.prefix = old
-        return d.replace('\n', '').replace('\r', '')
+        # Delete comments:
+        d = re.sub('#[^\n]+\n', ' ', d)
+        # Delete multi spaces/newlines
+        return re.sub('\s+', ' ', d).strip()
 
     @property
     def desc_with_module(self):
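The replaced return above is the reason for the new "import re": comments now survive into get_code() (after this commit they live in token prefixes), so the description has to strip them and then collapse every run of whitespace into a single space. A minimal standalone sketch of the same two-pass cleanup, not part of the commit (raw strings used here for modern Python):

    import re

    def clean_description(code):
        # Delete comments: everything from '#' up to and including the newline.
        code = re.sub(r'#[^\n]+\n', ' ', code)
        # Collapse runs of spaces/newlines into single spaces.
        return re.sub(r'\s+', ' ', code).strip()

    print(clean_description("for i in range(1):\n    #! a comment\n    i"))
    # -> 'for i in range(1): i'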

View File

@@ -14,8 +14,8 @@ from __future__ import absolute_import
 import string
 import re
 from io import StringIO
-from token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, NAME, OP,
-                   ERRORTOKEN, NEWLINE, INDENT, DEDENT)
+from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER,
+                               NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
 
 
 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
@@ -147,8 +147,11 @@ def source_tokens(source, line_offset=0):
 
 def generate_tokens(readline, line_offset=0):
     """
-    The original stdlib Python version with minor modifications.
-    Modified to not care about dedents.
+    A heavily modified Python standard library tokenizer.
+
+    Additionally to the default information, yields also the prefix of each
+    token. This idea comes from lib2to3. The prefix contains all information
+    that is irrelevant for the parser like newlines in parentheses or comments.
     """
     paren_level = 0  # count parentheses
     indents = [0]
@@ -158,6 +161,7 @@ def generate_tokens(readline, line_offset=0):
     contline = None
     new_line = False
     prefix = ''  # Should never be required, but here for safety
+    additional_prefix = ''
     while True:            # loop over lines in stream
         line = readline()  # readline returns empty when finished. See StringIO
         if not line:
@@ -192,7 +196,8 @@ def generate_tokens(readline, line_offset=0):
                 pos += 1
                 continue
 
-            prefix = pseudomatch.group(1)
+            prefix = pseudomatch.group(1) + additional_prefix
+            additional_prefix = ''
             start, pos = pseudomatch.span(2)
             spos = (lnum, start)
             token, initial = line[start:pos], line[start]
@@ -213,10 +218,12 @@ def generate_tokens(readline, line_offset=0):
             elif initial in '\r\n':
                 if not new_line and paren_level == 0:
                     yield NEWLINE, token, spos, prefix
+                else:
+                    additional_prefix = prefix + token
                 new_line = True
-            elif initial == '#':
+            elif initial == '#':  # Comments
                 assert not token.endswith("\n")
-                #yield Token(COMMENT, token, spos, prefix)
+                additional_prefix = prefix + token
             elif token in triple_quoted:
                 endprog = endprogs[token]
                 endmatch = endprog.match(line, pos)
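Taken together, the tokenizer hunks above buffer everything the parser does not need in additional_prefix: a newline inside parentheses (where no NEWLINE token is emitted) and every comment are saved and glued onto the prefix of the next token that is yielded. A rough usage sketch follows; the 4-tuple shape matches the yield statements above, but the import path jedi.parser.tokenize and the exact prefix strings are assumptions that depend on the jedi version:

    from io import StringIO
    from jedi.parser.tokenize import generate_tokens  # assumed module path

    source = u"x = (1 +\n     2)  # sum\ny = 3\n"
    for typ, token, start_pos, prefix in generate_tokens(StringIO(source).readline):
        print(typ, repr(token), start_pos, repr(prefix))
    # With this commit, the newline inside the parentheses lands in the
    # prefix of '2' and the '# sum' comment in the prefix of the following
    # NEWLINE token; neither is yielded as a token of its own.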

View File

@@ -59,7 +59,7 @@ class UserContext(object):
             first_line = (tok_str.splitlines() or [''])[0]
             column -= len(first_line)
             # Reverse the token again, so that it is in normal order again.
-            yield typ, tok_str[::-1], (self._line_temp, column), prefix
+            yield typ, tok_str[::-1], (self._line_temp, column), prefix[::-1]
 
     def _calc_path_until_cursor(self, start_pos):
         """
@@ -122,7 +122,7 @@ class UserContext(object):
                 break
 
             start_cursor = tok_start_pos
-            string = tok_str + string
+            string = tok_str + prefix + string
             last_type = tok_type
 
         # Don't need whitespace around a statement.
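Both UserContext hunks adapt the backwards tokenizer to the new prefixes: the source is tokenized in reverse, so the prefix arrives mirrored and is flipped with [::-1] just like the token string, and when rebuilding the path the prefix is re-attached after the token, because text that precedes a token in the reversed stream follows it in normal reading order. A tiny illustration of that accumulation, using a hypothetical hand-written token stream rather than jedi's actual output:

    # Tokens as a reverse tokenizer would deliver them: last token first,
    # each paired with the (already un-reversed) text separating it from
    # the token after it in normal reading order.
    reversed_stream = [('os', ''), ('import', ' ')]

    string = ''
    for tok_str, prefix in reversed_stream:
        # Prepend the token; its prefix sits between it and what follows.
        string = tok_str + prefix + string
    print(repr(string))  # 'import os'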

View File

@@ -181,13 +181,13 @@ ab1(ClassDef);ab2(ClassDef);ab3(ClassDef)
 # -----------------
 
 for i in range(1):
-    #! ['for i in range(1):    i']
+    #! ['for i in range(1): i']
     i
 
 for key, value in [(1,2)]:
-    #! ['for key, value in [(1,2)]:    key']
+    #! ['for key, value in [(1,2)]: key']
     key
 
 for i in []:
-    #! ['for i in []:    i']
+    #! ['for i in []: i']
     i