forked from VimPlug/jedi

Merge branch 'dev' into unicode_tokenize_fix2

Conflicts:
	AUTHORS.txt
farhad
2015-03-06 12:14:38 +04:00
105 changed files with 7728 additions and 6520 deletions


@@ -14,95 +14,26 @@ from __future__ import absolute_import
 import string
 import re
 from io import StringIO
-from token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, NAME, OP,
-                   ERRORTOKEN, NEWLINE)
+from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER,
+                               NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
 from jedi._compatibility import is_py3
-from jedi._compatibility import u
 
 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
 
 # From here on we have custom stuff (everything before was originally Python
 # internal code).
 FLOWS = ['if', 'else', 'elif', 'while', 'with', 'try', 'except', 'finally']
 
-namechars = string.ascii_letters + '_'
+if is_py3:
+    # Python 3 has str.isidentifier() to check if a char is a valid identifier
+    is_identifier = str.isidentifier
+else:
+    namechars = string.ascii_letters + '_'
+    is_identifier = lambda s: s in namechars
 
-COMMENT = N_TOKENS
-tok_name[COMMENT] = 'COMMENT'
-
-class Token(object):
-    """
-    The token object is an efficient representation of the structure
-    (type, token, (start_pos_line, start_pos_col)). It has indexer
-    methods that maintain compatibility to existing code that expects the above
-    structure.
-
-    >>> repr(Token(1, "test", (1, 1)))
-    "<Token: ('NAME', 'test', (1, 1))>"
-    >>> Token(1, 'bar', (3, 4)).__getstate__()
-    (1, 'bar', 3, 4)
-    >>> a = Token(0, 'baz', (0, 0))
-    >>> a.__setstate__((1, 'foo', 3, 4))
-    >>> a
-    <Token: ('NAME', 'foo', (3, 4))>
-    >>> a.start_pos
-    (3, 4)
-    >>> a.string
-    'foo'
-    >>> a._start_pos_col
-    4
-    >>> Token(1, u("😷"), (1, 1)).string + "p" == u("😷p")
-    True
-    """
-    __slots__ = ("type", "string", "_start_pos_line", "_start_pos_col")
-
-    def __init__(self, type, string, start_pos):
-        self.type = type
-        self.string = string
-        self._start_pos_line = start_pos[0]
-        self._start_pos_col = start_pos[1]
-
-    def __repr__(self):
-        typ = tok_name[self.type]
-        content = typ, self.string, (self._start_pos_line, self._start_pos_col)
-        return "<%s: %s>" % (type(self).__name__, content)
-
-    @property
-    def start_pos(self):
-        return (self._start_pos_line, self._start_pos_col)
-
-    @property
-    def end_pos(self):
-        """Returns end position respecting multiline tokens."""
-        end_pos_line = self._start_pos_line
-        lines = self.string.split('\n')
-        if self.string.endswith('\n'):
-            lines = lines[:-1]
-            lines[-1] += '\n'
-        end_pos_line += len(lines) - 1
-        end_pos_col = self._start_pos_col
-        # Check for multiline token
-        if self._start_pos_line == end_pos_line:
-            end_pos_col += len(lines[-1])
-        else:
-            end_pos_col = len(lines[-1])
-        return (end_pos_line, end_pos_col)
-
-    # Make cache footprint smaller for faster unpickling
-    def __getstate__(self):
-        return (self.type, self.string,
-                self._start_pos_line, self._start_pos_col)
-
-    def __setstate__(self, state):
-        self.type = state[0]
-        self.string = state[1]
-        self._start_pos_line = state[2]
-        self._start_pos_col = state[3]
 
 def group(*choices):
     return '(' + '|'.join(choices) + ')'
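The hunk above drops the Token class in favor of plain (type, string, start_pos, prefix) tuples. A minimal sketch, not part of this commit, of how a consumer can still recover the end position that the deleted end_pos property used to compute; the helper name is ours, not jedi's:

# Hypothetical helper mirroring the removed Token.end_pos logic.
def end_pos(string, start_pos):
    line, col = start_pos
    lines = string.split('\n')
    if string.endswith('\n'):
        lines = lines[:-1]
        lines[-1] += '\n'
    end_line = line + len(lines) - 1
    if line == end_line:  # token stays on one line
        return end_line, col + len(lines[-1])
    return end_line, len(lines[-1])  # multiline: column restarts

assert end_pos('foo', (1, 4)) == (1, 7)
assert end_pos('"""a\nbc"""', (1, 0)) == (2, 5)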
@@ -158,7 +89,8 @@ cont_str = group(r"[bBuU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                  r'[bBuU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                  group('"', r'\\\r?\n'))
 pseudo_extras = group(r'\\\r?\n', comment, triple)
-pseudo_token = whitespace + group(pseudo_extras, number, funny, cont_str, name)
+pseudo_token = group(whitespace) + \
+    group(pseudo_extras, number, funny, cont_str, name)
 
 def _compile(expr):
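Why the whitespace moves into its own group: the pseudo match now exposes leading whitespace as group 1 (it becomes part of the token's prefix) and the token itself as group 2, which is why the tokenizer below switches from span(1) to span(2). A small illustration with a simplified pattern (the real pseudo_token is assembled above and is much larger):

import re

# Simplified stand-in for pseudo_token; only the grouping matters here.
simple_pseudo = re.compile(r'(\s*)(\w+|[^\s\w])')

m = simple_pseudo.match('    foo = 1')
assert m.group(1) == '    '   # leading whitespace -> token prefix
assert m.span(2) == (4, 7)    # start/end of the token itself, 'foo'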
@@ -167,6 +99,7 @@ def _compile(expr):
 pseudoprog, single3prog, double3prog = map(
     _compile, (pseudo_token, single3, double3))
 
+
 endprogs = {"'": _compile(single), '"': _compile(double),
             "'''": single3prog, '"""': double3prog,
             "r'''": single3prog, 'r"""': double3prog,
@@ -202,28 +135,43 @@ del _compile
 tabsize = 8
 
+ALWAYS_BREAK_TOKENS = (';', 'import', 'from', 'class', 'def', 'try', 'except',
+                       'finally', 'while', 'return')
 
-def source_tokens(source, line_offset=0):
+def source_tokens(source):
     """Generate tokens from the source code (string)."""
     source = source + '\n'  # end with \n, because the parser needs it
     readline = StringIO(source).readline
-    return generate_tokens(readline, line_offset)
+    return generate_tokens(readline)
 
-def generate_tokens(readline, line_offset=0):
+def generate_tokens(readline):
     """
-    The original stdlib Python version with minor modifications.
-    Modified to not care about dedents.
+    A heavily modified Python standard library tokenizer.
+
+    In addition to the default information, this also yields the prefix of
+    each token. The idea comes from lib2to3. The prefix contains everything
+    that is irrelevant for the parser, like newlines in parentheses or
+    comments.
     """
-    lnum = line_offset
+    paren_level = 0  # count parentheses
+    indents = [0]
+    lnum = 0
     numchars = '0123456789'
     contstr = ''
     contline = None
-    while True:  # loop over lines in stream
-        line = readline()  # readline returns empty if it's finished. See StringIO
+    # We start with a newline. This makes indent at the first position
+    # possible. It's not valid Python, but still better than an INDENT in the
+    # second line (and not in the first). This makes quite a few things in
+    # Jedi's fast parser possible.
+    new_line = True
+    prefix = ''  # Should never be required, but here for safety
+    additional_prefix = ''
+    while True:  # loop over lines in stream
+        line = readline()  # readline returns empty when finished. See StringIO
         if not line:
             if contstr:
-                yield Token(ERRORTOKEN, contstr, contstr_start)
+                yield ERRORTOKEN, contstr, contstr_start, prefix
             break
 
         lnum += 1
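What the prefix machinery buys: additional_prefix accumulates material such as comments and blank lines until the next real token, so comments never surface as tokens of their own. A hedged usage sketch, assuming the module lives at jedi.parser.tokenize as the imports suggest:

from jedi.parser import tokenize

source = 'x = 1  # a comment\ny = 2\n'
tokens = list(tokenize.source_tokens(source))
# The comment travels in the prefix of the token that follows it
# (the NEWLINE here), not as a COMMENT token.
assert all('#' not in string for typ, string, start_pos, prefix in tokens)
assert any('# a comment' in prefix for typ, string, start_pos, prefix in tokens)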
@@ -233,7 +181,7 @@ def generate_tokens(readline, line_offset=0):
             endmatch = endprog.match(line)
             if endmatch:
                 pos = endmatch.end(0)
-                yield Token(STRING, contstr + line[:pos], contstr_start)
+                yield STRING, contstr + line[:pos], contstr_start, prefix
                 contstr = ''
                 contline = None
             else:
@@ -248,32 +196,48 @@ def generate_tokens(readline, line_offset=0):
                 if line[pos] in '"\'':
                     # If a literal starts but doesn't end the whole rest of the
                     # line is an error token.
-                    txt = txt = line[pos:]
-                    yield Token(ERRORTOKEN, txt, (lnum, pos))
+                    txt = line[pos:]
+                    yield ERRORTOKEN, txt, (lnum, pos), prefix
                     pos += 1
                     continue
 
-            start, pos = pseudomatch.span(1)
+            prefix = additional_prefix + pseudomatch.group(1)
+            additional_prefix = ''
+            start, pos = pseudomatch.span(2)
             spos = (lnum, start)
             token, initial = line[start:pos], line[start]
 
+            if new_line and initial not in '\r\n#':
+                new_line = False
+                if paren_level == 0:
+                    if start > indents[-1]:
+                        yield INDENT, '', spos, ''
+                        indents.append(start)
+                    while start < indents[-1]:
+                        yield DEDENT, '', spos, ''
+                        indents.pop()
+
             if (initial in numchars or  # ordinary number
                     (initial == '.' and token != '.' and token != '...')):
-                yield Token(NUMBER, token, spos)
+                yield NUMBER, token, spos, prefix
             elif initial in '\r\n':
-                yield Token(NEWLINE, token, spos)
-            elif initial == '#':
+                if not new_line and paren_level == 0:
+                    yield NEWLINE, token, spos, prefix
+                else:
+                    additional_prefix = prefix + token
+                new_line = True
+            elif initial == '#':  # Comments
                 assert not token.endswith("\n")
-                yield Token(COMMENT, token, spos)
+                additional_prefix = prefix + token
             elif token in triple_quoted:
                 endprog = endprogs[token]
                 endmatch = endprog.match(line, pos)
                 if endmatch:  # all on one line
                     pos = endmatch.end(0)
                     token = line[start:pos]
-                    yield Token(STRING, token, spos)
+                    yield STRING, token, spos, prefix
                 else:
-                    contstr_start = (lnum, start) # multiple lines
+                    contstr_start = (lnum, start)  # multiple lines
                     contstr = line[start:]
                     contline = line
                     break
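The INDENT/DEDENT bookkeeping added above follows the classic indent-stack rule from CPython's tokenize, guarded by paren_level so that layout inside brackets stays insignificant. A standalone sketch of just that rule, under our own names:

def indent_events(start_columns):
    """Yield INDENT/DEDENT names for a sequence of statement columns."""
    indents = [0]
    for start in start_columns:
        if start > indents[-1]:
            yield 'INDENT'
            indents.append(start)
        while start < indents[-1]:
            yield 'DEDENT'
            indents.pop()

assert list(indent_events([0, 4, 8, 4, 0])) == \
    ['INDENT', 'INDENT', 'DEDENT', 'DEDENT']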
@@ -288,12 +252,28 @@ def generate_tokens(readline, line_offset=0):
                     contline = line
                     break
                 else:  # ordinary string
-                    yield Token(STRING, token, spos)
-            elif initial in namechars:  # ordinary name
-                yield Token(NAME, token, spos)
+                    yield STRING, token, spos, prefix
+            elif is_identifier(initial):  # ordinary name
+                if token in ALWAYS_BREAK_TOKENS:
+                    paren_level = 0
+                    while True:
+                        indent = indents.pop()
+                        if indent > start:
+                            yield DEDENT, '', spos, ''
+                        else:
+                            indents.append(indent)
+                            break
+                yield NAME, token, spos, prefix
             elif initial == '\\' and line[start:] == '\\\n':  # continued stmt
-                continue
+                additional_prefix += prefix + line[start:]
+                break
             else:
-                yield Token(OP, token, spos)
+                if token in '([{':
+                    paren_level += 1
+                elif token in ')]}':
+                    paren_level -= 1
+                yield OP, token, spos, prefix
 
-    yield Token(ENDMARKER, '', (lnum, 0))
+    for indent in indents[1:]:
+        yield DEDENT, '', spos, ''
+    yield ENDMARKER, '', spos, prefix
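The ALWAYS_BREAK_TOKENS branch is error recovery: after an unclosed bracket, a statement keyword such as def resets paren_level and unwinds the indent stack, so tokenization re-synchronizes instead of treating the rest of the file as bracket contents. A hedged demo, under the same jedi.parser.tokenize module-path assumption as above:

from jedi.parser import tokenize

broken = 'x = (1,\ndef f():\n    pass\n'   # '(' is never closed
for typ, string, start_pos, prefix in tokenize.source_tokens(broken):
    print(tokenize.tok_name[typ], repr(string), start_pos)
# 'def' is tokenized as the start of a new statement rather than being
# swallowed by the open parenthesis.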