forked from VimPlug/jedi

Add preceding whitespace collection to tokenizer

Joel Wright
2014-07-30 11:59:20 +01:00
parent a01e4c6b37
commit 07d0a43f7e
4 changed files with 117 additions and 53 deletions
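
A quick sketch of what the change enables (not part of the diff itself): after this commit, every Token yielded by jedi's tokenizer also carries the whitespace that immediately preceded it, in the new _preceding_whitespace slot. The snippet below follows the pattern of the tests added in this commit; the expected values in the comments are illustrative.

    from io import StringIO

    from jedi import parser

    # Tokenize a tiny line of code and show each token together with the
    # whitespace collected in front of it (empty string when there is none).
    tokens = list(parser.tokenize.generate_tokens(StringIO('x = 1\n').readline))
    for tok in tokens:
        print(repr(tok.string), '->', repr(tok._preceding_whitespace))
    # Expected roughly: 'x' -> '', '=' -> ' ', '1' -> ' ', '\n' -> ''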

View File

@@ -243,7 +243,7 @@ def save_parser(path, name, parser, pickling=True):
 class ParserPickling(object):

-    version = 13
+    version = 14
     """
     Version number (integer) for file system cache.

View File

@@ -37,43 +37,47 @@ tok_name[COMMENT] = 'COMMENT'
 class Token(object):
     """
     The token object is an efficient representation of the structure
-    (type, token, (start_pos_line, start_pos_col)). It has indexer
+    (type, token, (start_pos_line, start_pos_col, preceding_whitespace)). It has indexer
     methods that maintain compatibility to existing code that expects the above
     structure.

-    >>> repr(Token(1, "test", (1, 1)))
-    "<Token: ('NAME', 'test', (1, 1))>"
-    >>> Token(1, 'bar', (3, 4)).__getstate__()
-    (1, 'bar', 3, 4)
-    >>> a = Token(0, 'baz', (0, 0))
-    >>> a.__setstate__((1, 'foo', 3, 4))
+    >>> repr(Token(1, "test", (1, 1, '')))
+    "<Token: ('NAME', 'test', (1, 1, ''))>"
+    >>> Token(1, 'bar', (3, 4, '')).__getstate__()
+    (1, 'bar', 3, 4, '')
+    >>> a = Token(0, 'baz', (0, 0, ''))
+    >>> a.__setstate__((1, 'foo', 3, 4, ''))
     >>> a
-    <Token: ('NAME', 'foo', (3, 4))>
+    <Token: ('NAME', 'foo', (3, 4, ''))>
     >>> a.start_pos
     (3, 4)
     >>> a.string
     'foo'
     >>> a._start_pos_col
     4
-    >>> Token(1, u("😷"), (1 ,1)).string + "p" == u("😷p")
+    >>> Token(1, u("😷"), (1 ,1, '')).string + "p" == u("😷p")
     True
     """
-    __slots__ = ("type", "string", "_start_pos_line", "_start_pos_col")
+    __slots__ = ("type", "string", "_start_pos_line", "_start_pos_col",
+                 "_preceding_whitespace")

-    def __init__(self, type, string, start_pos):
+    def __init__(self, type, string, start_pos, whitespace=''):
         self.type = type
         self.string = string
         self._start_pos_line = start_pos[0]
         self._start_pos_col = start_pos[1]
+        self._preceding_whitespace = whitespace

     def __repr__(self):
         typ = tok_name[self.type]
-        content = typ, self.string, (self._start_pos_line, self._start_pos_col)
+        content = typ, self.string,\
+            (self._start_pos_line, self._start_pos_col,
+             self._preceding_whitespace)
         return "<%s: %s>" % (type(self).__name__, content)

     @property
     def start_pos(self):
-        return (self._start_pos_line, self._start_pos_col)
+        return self._start_pos_line, self._start_pos_col

     @property
     def end_pos(self):
@@ -94,13 +98,16 @@ class Token(object):
     # Make cache footprint smaller for faster unpickling
     def __getstate__(self):
-        return (self.type, self.string, self._start_pos_line, self._start_pos_col)
+        return (self.type, self.string,
+                self._start_pos_line, self._start_pos_col,
+                self._preceding_whitespace)

     def __setstate__(self, state):
         self.type = state[0]
         self.string = state[1]
         self._start_pos_line = state[2]
         self._start_pos_col = state[3]
+        self._preceding_whitespace = state[4]


 def group(*choices):
@@ -158,7 +165,8 @@ cont_str = group(r"[bBuU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                  r'[bBuU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                  group('"', r'\\\r?\n'))
 pseudo_extras = group(r'\\\r?\n', comment, triple)
-pseudo_token = whitespace + group(pseudo_extras, number, funny, cont_str, name)
+pseudo_token = group(whitespace) + \
+    group(pseudo_extras, number, funny, cont_str, name)


 def _compile(expr):
@@ -167,6 +175,7 @@ def _compile(expr):
 pseudoprog, single3prog, double3prog = map(
     _compile, (pseudo_token, single3, double3))

 endprogs = {"'": _compile(single), '"': _compile(double),
             "'''": single3prog, '"""': double3prog,
             "r'''": single3prog, 'r"""': double3prog,
@@ -219,11 +228,12 @@ def generate_tokens(readline, line_offset=0):
     numchars = '0123456789'
     contstr = ''
     contline = None
-    while True:            # loop over lines in stream
-        line = readline()  # readline returns empty if it's finished. See StringIO
+    ws = ''  # Should never be required, but here for safety
+    while True:            # loop over lines in stream
+        line = readline()  # readline returns empty when finished. See StringIO
         if not line:
             if contstr:
-                yield Token(ERRORTOKEN, contstr, contstr_start)
+                yield Token(ERRORTOKEN, contstr, contstr_start, whitespace=ws)
             break

         lnum += 1
@@ -233,7 +243,8 @@ def generate_tokens(readline, line_offset=0):
             endmatch = endprog.match(line)
             if endmatch:
                 pos = endmatch.end(0)
-                yield Token(STRING, contstr + line[:pos], contstr_start)
+                yield Token(STRING, contstr + line[:pos],
+                            contstr_start, whitespace=ws)
                 contstr = ''
                 contline = None
             else:
@@ -248,32 +259,33 @@ def generate_tokens(readline, line_offset=0):
                 if line[pos] in '"\'':
                     # If a literal starts but doesn't end the whole rest of the
                     # line is an error token.
                     txt = line[pos:]
                 yield Token(ERRORTOKEN, txt, (lnum, pos))
                 pos += 1
                 continue

-            start, pos = pseudomatch.span(1)
+            ws = pseudomatch.group(1)
+            start, pos = pseudomatch.span(2)
             spos = (lnum, start)
             token, initial = line[start:pos], line[start]

             if (initial in numchars or  # ordinary number
                     (initial == '.' and token != '.' and token != '...')):
-                yield Token(NUMBER, token, spos)
+                yield Token(NUMBER, token, spos, whitespace=ws)
             elif initial in '\r\n':
-                yield Token(NEWLINE, token, spos)
+                yield Token(NEWLINE, token, spos, whitespace=ws)
             elif initial == '#':
                 assert not token.endswith("\n")
-                yield Token(COMMENT, token, spos)
+                yield Token(COMMENT, token, spos, whitespace=ws)
             elif token in triple_quoted:
                 endprog = endprogs[token]
                 endmatch = endprog.match(line, pos)
                 if endmatch:                                # all on one line
                     pos = endmatch.end(0)
                     token = line[start:pos]
-                    yield Token(STRING, token, spos)
+                    yield Token(STRING, token, spos, whitespace=ws)
                 else:
                     contstr_start = (lnum, start)           # multiple lines
                     contstr = line[start:]
                     contline = line
                     break
@@ -288,12 +300,12 @@ def generate_tokens(readline, line_offset=0):
                     contline = line
                     break
                 else:                                       # ordinary string
-                    yield Token(STRING, token, spos)
+                    yield Token(STRING, token, spos, whitespace=ws)
             elif initial in namechars:                      # ordinary name
-                yield Token(NAME, token, spos)
+                yield Token(NAME, token, spos, whitespace=ws)
             elif initial == '\\' and line[start:] == '\\\n':  # continued stmt
                 continue
             else:
-                yield Token(OP, token, spos)
+                yield Token(OP, token, spos, whitespace=ws)

     yield Token(ENDMARKER, '', (lnum, 0))
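
The pseudo_token change above works by giving the whitespace pattern its own capturing group, so that for each pseudomatch group(1) is the run of skipped whitespace and group(2) is the token text. Below is a standalone sketch of that idea using plain re; the patterns here are simplified stand-ins, not jedi's real ones.

    import re

    # Simplified stand-ins for jedi's whitespace/token alternatives.
    whitespace = r'[ \f\t]*'
    name = r'\w+'
    op = r'[=+\-*/]'
    pseudo_token = '(' + whitespace + ')(' + name + '|' + op + ')'
    pseudoprog = re.compile(pseudo_token)

    line = 'x = 1'
    pos = 0
    while pos < len(line):
        pseudomatch = pseudoprog.match(line, pos)
        if not pseudomatch:
            break
        ws = pseudomatch.group(1)         # the preceding whitespace, as collected above
        start, pos = pseudomatch.span(2)  # token boundaries now come from group 2
        print(repr(ws), repr(line[start:pos]))
    # Prints: '' 'x'  /  ' ' '='  /  ' ' '1'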

View File

@@ -1,23 +0,0 @@
-from jedi._compatibility import u
-from jedi import parser
-from ..helpers import unittest
-
-
-class TokenTest(unittest.TestCase):
-    def test_end_pos_one_line(self):
-        parsed = parser.Parser(u('''
-def testit():
-    a = "huhu"
-'''))
-        tok = parsed.module.subscopes[0].statements[0]._token_list[2]
-        self.assertEqual(tok.end_pos, (3, 14))
-
-    def test_end_pos_multi_line(self):
-        parsed = parser.Parser(u('''
-def testit():
-    a = """huhu
-asdfasdf""" + "h"
-'''))
-        tok = parsed.module.subscopes[0].statements[0]._token_list[2]
-        self.assertEqual(tok.end_pos, (4, 11))

View File

@@ -0,0 +1,75 @@
+from io import StringIO
+from token import NEWLINE, STRING
+
+from jedi._compatibility import u
+from jedi import parser
+from ..helpers import unittest
+
+
+class TokenTest(unittest.TestCase):
+    def test_end_pos_one_line(self):
+        parsed = parser.Parser(u('''
+def testit():
+    a = "huhu"
+'''))
+        tok = parsed.module.subscopes[0].statements[0]._token_list[2]
+        self.assertEqual(tok.end_pos, (3, 14))
+
+    def test_end_pos_multi_line(self):
+        parsed = parser.Parser(u('''
+def testit():
+    a = """huhu
+asdfasdf""" + "h"
+'''))
+        tok = parsed.module.subscopes[0].statements[0]._token_list[2]
+        self.assertEqual(tok.end_pos, (4, 11))
+
+    def test_simple_no_whitespace(self):
+        # Test a simple one line string, no preceding whitespace
+        simple_docstring = '"""simple one line docstring"""'
+        simple_docstring_io = StringIO(simple_docstring)
+        tokens = parser.tokenize.generate_tokens(simple_docstring_io.readline)
+        token_list = list(tokens)
+        string_token = token_list[0]
+        self.assertEqual(string_token._preceding_whitespace, '')
+        self.assertEqual(string_token.string, '"""simple one line docstring"""')
+
+    def test_simple_with_whitespace(self):
+        # Test a simple one line string with preceding whitespace and newline
+        simple_docstring = ' """simple one line docstring""" \r\n'
+        simple_docstring_io = StringIO(simple_docstring)
+        tokens = parser.tokenize.generate_tokens(simple_docstring_io.readline)
+        token_list = list(tokens)
+        string_token = token_list[0]
+        self.assertEqual(string_token._preceding_whitespace, ' ')
+        self.assertEqual(string_token.string, '"""simple one line docstring"""')
+        self.assertEqual(string_token.type, STRING)
+        newline_token = token_list[1]
+        self.assertEqual(newline_token._preceding_whitespace, ' ')
+        self.assertEqual(newline_token.type, NEWLINE)
+
+    def test_function_whitespace(self):
+        # Test function definition whitespace identification
+        fundef = '''def test_whitespace(*args, **kwargs):
+    x = 1
+    if x > 0:
+        print(True)
+'''
+        fundef_io = StringIO(fundef)
+        tokens = parser.tokenize.generate_tokens(fundef_io.readline)
+        token_list = list(tokens)
+        print(token_list)
+        for t in token_list:
+            if t.string == 'test_whitespace':
+                self.assertEqual(t._preceding_whitespace, ' ')
+            if t.string == '(':
+                self.assertEqual(t._preceding_whitespace, '')
+            if t.string == '*':
+                self.assertEqual(t._preceding_whitespace, '')
+            if t.string == '**':
+                self.assertEqual(t._preceding_whitespace, ' ')
+            if t.string == 'print':
+                self.assertEqual(t._preceding_whitespace, ' ')
+            if t.string == 'if':
+                self.assertEqual(t._preceding_whitespace, ' ')