forked from VimPlug/jedi
Add preceding whitespace collection to tokenizer
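In outline, the commit threads one new piece of state through the tokenizer: each Token now records the exact whitespace that preceded it in the source, the pseudo-token regex captures that whitespace as its own group, the pickled token state grows a fifth field, and the parser cache version is bumped so stale pickles are discarded. A hedged usage sketch of the result (it assumes this fork of jedi is importable; parser.tokenize.generate_tokens is the entry point the new tests exercise, and on Python 2 the source would need the u() wrapper the tests use):

    from io import StringIO

    from jedi import parser

    # Each yielded token now carries the run of whitespace that led up to it.
    for tok in parser.tokenize.generate_tokens(StringIO('  x = 1\n').readline):
        print(repr(tok._preceding_whitespace), repr(tok.string))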
@@ -243,7 +243,7 @@ def save_parser(path, name, parser, pickling=True):


 class ParserPickling(object):

-    version = 13
+    version = 14
     """
     Version number (integer) for file system cache.

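The bump from 13 to 14 is forced by the pickle change further down: tokens cached under version 13 carry four-element state tuples, while the new __setstate__ reads state[4]. A toy illustration of the failure the version gate avoids (illustrative values, not real cache contents):

    old_state = (1, 'bar', 3, 4)   # four-field state, as a version-13 cache stored it
    try:
        old_state[4]               # what the new __setstate__ would try to read
    except IndexError:
        pass                       # stale caches would crash, hence the version bump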
@@ -37,43 +37,47 @@ tok_name[COMMENT] = 'COMMENT'
 class Token(object):
     """
     The token object is an efficient representation of the structure
-    (type, token, (start_pos_line, start_pos_col)). It has indexer
+    (type, token, (start_pos_line, start_pos_col, preceding_whitespace)). It has indexer
     methods that maintain compatibility to existing code that expects the above
     structure.

-    >>> repr(Token(1, "test", (1, 1)))
-    "<Token: ('NAME', 'test', (1, 1))>"
-    >>> Token(1, 'bar', (3, 4)).__getstate__()
-    (1, 'bar', 3, 4)
-    >>> a = Token(0, 'baz', (0, 0))
-    >>> a.__setstate__((1, 'foo', 3, 4))
+    >>> repr(Token(1, "test", (1, 1, '')))
+    "<Token: ('NAME', 'test', (1, 1, ''))>"
+    >>> Token(1, 'bar', (3, 4, '')).__getstate__()
+    (1, 'bar', 3, 4, '')
+    >>> a = Token(0, 'baz', (0, 0, ''))
+    >>> a.__setstate__((1, 'foo', 3, 4, ''))
     >>> a
-    <Token: ('NAME', 'foo', (3, 4))>
+    <Token: ('NAME', 'foo', (3, 4, ''))>
     >>> a.start_pos
     (3, 4)
     >>> a.string
     'foo'
     >>> a._start_pos_col
     4
-    >>> Token(1, u("😷"), (1 ,1)).string + "p" == u("😷p")
+    >>> Token(1, u("😷"), (1 ,1, '')).string + "p" == u("😷p")
     True
     """
-    __slots__ = ("type", "string", "_start_pos_line", "_start_pos_col")
+    __slots__ = ("type", "string", "_start_pos_line", "_start_pos_col",
+                 "_preceding_whitespace")

-    def __init__(self, type, string, start_pos):
+    def __init__(self, type, string, start_pos, whitespace=''):
         self.type = type
         self.string = string
         self._start_pos_line = start_pos[0]
         self._start_pos_col = start_pos[1]
+        self._preceding_whitespace = whitespace

     def __repr__(self):
         typ = tok_name[self.type]
-        content = typ, self.string, (self._start_pos_line, self._start_pos_col)
+        content = typ, self.string,\
+            (self._start_pos_line, self._start_pos_col,
+             self._preceding_whitespace)
         return "<%s: %s>" % (type(self).__name__, content)

     @property
     def start_pos(self):
-        return (self._start_pos_line, self._start_pos_col)
+        return self._start_pos_line, self._start_pos_col

     @property
     def end_pos(self):
@@ -94,13 +98,16 @@ class Token(object):

     # Make cache footprint smaller for faster unpickling
     def __getstate__(self):
-        return (self.type, self.string, self._start_pos_line, self._start_pos_col)
+        return (self.type, self.string,
+                self._start_pos_line, self._start_pos_col,
+                self._preceding_whitespace)

     def __setstate__(self, state):
         self.type = state[0]
         self.string = state[1]
         self._start_pos_line = state[2]
         self._start_pos_col = state[3]
+        self._preceding_whitespace = state[4]


 def group(*choices):
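To see the new five-field state in isolation, here is a minimal stand-in (a hypothetical MiniToken, not jedi's Token) that mirrors the patched __getstate__/__setstate__ pair and round-trips through pickle:

    import pickle

    class MiniToken(object):
        __slots__ = ("type", "string", "_start_pos_line", "_start_pos_col",
                     "_preceding_whitespace")

        def __init__(self, type, string, start_pos, whitespace=''):
            self.type = type
            self.string = string
            self._start_pos_line = start_pos[0]
            self._start_pos_col = start_pos[1]
            self._preceding_whitespace = whitespace

        # Same shape as the patched methods above.
        def __getstate__(self):
            return (self.type, self.string,
                    self._start_pos_line, self._start_pos_col,
                    self._preceding_whitespace)

        def __setstate__(self, state):
            self.type = state[0]
            self.string = state[1]
            self._start_pos_line = state[2]
            self._start_pos_col = state[3]
            self._preceding_whitespace = state[4]

    token = MiniToken(1, 'foo', (3, 4), whitespace='  ')
    copy = pickle.loads(pickle.dumps(token))
    assert copy.__getstate__() == (1, 'foo', 3, 4, '  ')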
@@ -158,7 +165,8 @@ cont_str = group(r"[bBuU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                  r'[bBuU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                  group('"', r'\\\r?\n'))
 pseudo_extras = group(r'\\\r?\n', comment, triple)
-pseudo_token = whitespace + group(pseudo_extras, number, funny, cont_str, name)
+pseudo_token = group(whitespace) + \
+    group(pseudo_extras, number, funny, cont_str, name)


 def _compile(expr):
@@ -167,6 +175,7 @@ def _compile(expr):

 pseudoprog, single3prog, double3prog = map(
     _compile, (pseudo_token, single3, double3))
+
 endprogs = {"'": _compile(single), '"': _compile(double),
             "'''": single3prog, '"""': double3prog,
             "r'''": single3prog, 'r"""': double3prog,
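The pseudo_token change is the heart of the commit. Previously the leading whitespace was matched but thrown away, because the first capture group spanned the token itself; wrapping whitespace in group(...) makes it capture group 1 and shifts the token alternatives to group 2, which is exactly how generate_tokens consumes the match below (ws = pseudomatch.group(1); start, pos = pseudomatch.span(2)). A self-contained illustration with deliberately simplified patterns (not jedi's full ones):

    import re

    whitespace = r'[ \f\t]*'
    name = r'[a-zA-Z_]\w*'
    pseudo_token = re.compile('(' + whitespace + ')(' + name + ')')

    pseudomatch = pseudo_token.match('    foo')
    assert pseudomatch.group(1) == '    '   # the preceding whitespace
    assert pseudomatch.span(2) == (4, 7)    # the token's start and end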
@@ -219,11 +228,12 @@ def generate_tokens(readline, line_offset=0):
     numchars = '0123456789'
     contstr = ''
     contline = None
+    ws = ''                    # Should never be required, but here for safety
     while True:                # loop over lines in stream
-        line = readline()      # readline returns empty if it's finished. See StringIO
+        line = readline()      # readline returns empty when finished. See StringIO
         if not line:
             if contstr:
-                yield Token(ERRORTOKEN, contstr, contstr_start)
+                yield Token(ERRORTOKEN, contstr, contstr_start, whitespace=ws)
             break

         lnum += 1
@@ -233,7 +243,8 @@ def generate_tokens(readline, line_offset=0):
             endmatch = endprog.match(line)
             if endmatch:
                 pos = endmatch.end(0)
-                yield Token(STRING, contstr + line[:pos], contstr_start)
+                yield Token(STRING, contstr + line[:pos],
+                            contstr_start, whitespace=ws)
                 contstr = ''
                 contline = None
             else:
@@ -248,32 +259,33 @@ def generate_tokens(readline, line_offset=0):
                 if line[pos] in '"\'':
                     # If a literal starts but doesn't end the whole rest of the
                     # line is an error token.
-                    txt = txt = line[pos:]
+                    txt = line[pos:]
                 yield Token(ERRORTOKEN, txt, (lnum, pos))
                 pos += 1
                 continue

-            start, pos = pseudomatch.span(1)
+            ws = pseudomatch.group(1)
+            start, pos = pseudomatch.span(2)
             spos = (lnum, start)
             token, initial = line[start:pos], line[start]

             if (initial in numchars or                      # ordinary number
                     (initial == '.' and token != '.' and token != '...')):
-                yield Token(NUMBER, token, spos)
+                yield Token(NUMBER, token, spos, whitespace=ws)
             elif initial in '\r\n':
-                yield Token(NEWLINE, token, spos)
+                yield Token(NEWLINE, token, spos, whitespace=ws)
             elif initial == '#':
                 assert not token.endswith("\n")
-                yield Token(COMMENT, token, spos)
+                yield Token(COMMENT, token, spos, whitespace=ws)
             elif token in triple_quoted:
                 endprog = endprogs[token]
                 endmatch = endprog.match(line, pos)
                 if endmatch:                                # all on one line
                     pos = endmatch.end(0)
                     token = line[start:pos]
-                    yield Token(STRING, token, spos)
+                    yield Token(STRING, token, spos, whitespace=ws)
                 else:
                     contstr_start = (lnum, start)           # multiple lines
                     contstr = line[start:]
                     contline = line
                     break
@@ -288,12 +300,12 @@ def generate_tokens(readline, line_offset=0):
                     contline = line
                     break
                 else:                                       # ordinary string
-                    yield Token(STRING, token, spos)
+                    yield Token(STRING, token, spos, whitespace=ws)
             elif initial in namechars:                      # ordinary name
-                yield Token(NAME, token, spos)
+                yield Token(NAME, token, spos, whitespace=ws)
             elif initial == '\\' and line[start:] == '\\\n':  # continued stmt
                 continue
             else:
-                yield Token(OP, token, spos)
+                yield Token(OP, token, spos, whitespace=ws)

     yield Token(ENDMARKER, '', (lnum, 0))
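The property this buys, and presumably the motivation for the commit: once every token carries its preceding whitespace, concatenating whitespace + string across the token stream reproduces the source text exactly. The toy tokenizer below (self-contained, not jedi's code) demonstrates that invariant on one line:

    import re

    pseudo = re.compile(r'([ \f\t]*)(\w+|\S)')

    def toy_tokens(line):
        pos = 0
        while pos < len(line):
            match = pseudo.match(line, pos)
            if match is None:       # trailing whitespace only
                break
            # group 1 is the preceding whitespace, group 2 the token string
            yield match.group(1), match.group(2)
            pos = match.end(2)

    line = 'def  f( x ):'
    assert ''.join(ws + string for ws, string in toy_tokens(line)) == line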
@@ -1,23 +0,0 @@
-from jedi._compatibility import u
-from jedi import parser
-
-from ..helpers import unittest
-
-
-class TokenTest(unittest.TestCase):
-    def test_end_pos_one_line(self):
-        parsed = parser.Parser(u('''
-def testit():
-    a = "huhu"
-'''))
-        tok = parsed.module.subscopes[0].statements[0]._token_list[2]
-        self.assertEqual(tok.end_pos, (3, 14))
-
-    def test_end_pos_multi_line(self):
-        parsed = parser.Parser(u('''
-def testit():
-    a = """huhu
-asdfasdf""" + "h"
-'''))
-        tok = parsed.module.subscopes[0].statements[0]._token_list[2]
-        self.assertEqual(tok.end_pos, (4, 11))
test/test_parser/test_tokenizer.py (new file, 75 lines)
@@ -0,0 +1,75 @@
+from io import StringIO
+from token import NEWLINE, STRING
+
+from jedi._compatibility import u
+from jedi import parser
+
+from ..helpers import unittest
+
+
+class TokenTest(unittest.TestCase):
+    def test_end_pos_one_line(self):
+        parsed = parser.Parser(u('''
+def testit():
+    a = "huhu"
+'''))
+        tok = parsed.module.subscopes[0].statements[0]._token_list[2]
+        self.assertEqual(tok.end_pos, (3, 14))
+
+    def test_end_pos_multi_line(self):
+        parsed = parser.Parser(u('''
+def testit():
+    a = """huhu
+asdfasdf""" + "h"
+'''))
+        tok = parsed.module.subscopes[0].statements[0]._token_list[2]
+        self.assertEqual(tok.end_pos, (4, 11))
+
+    def test_simple_no_whitespace(self):
+        # Test a simple one line string, no preceding whitespace
+        simple_docstring = '"""simple one line docstring"""'
+        simple_docstring_io = StringIO(simple_docstring)
+        tokens = parser.tokenize.generate_tokens(simple_docstring_io.readline)
+        token_list = list(tokens)
+        string_token = token_list[0]
+        self.assertEqual(string_token._preceding_whitespace, '')
+        self.assertEqual(string_token.string, '"""simple one line docstring"""')
+
+    def test_simple_with_whitespace(self):
+        # Test a simple one line string with preceding whitespace and newline
+        simple_docstring = ' """simple one line docstring""" \r\n'
+        simple_docstring_io = StringIO(simple_docstring)
+        tokens = parser.tokenize.generate_tokens(simple_docstring_io.readline)
+        token_list = list(tokens)
+        string_token = token_list[0]
+        self.assertEqual(string_token._preceding_whitespace, ' ')
+        self.assertEqual(string_token.string, '"""simple one line docstring"""')
+        self.assertEqual(string_token.type, STRING)
+        newline_token = token_list[1]
+        self.assertEqual(newline_token._preceding_whitespace, ' ')
+        self.assertEqual(newline_token.type, NEWLINE)
+
+    def test_function_whitespace(self):
+        # Test function definition whitespace identification
+        fundef = '''def test_whitespace(*args, **kwargs):
+    x = 1
+    if x > 0:
+        print(True)
+'''
+        fundef_io = StringIO(fundef)
+        tokens = parser.tokenize.generate_tokens(fundef_io.readline)
+        token_list = list(tokens)
+        print(token_list)
+        for t in token_list:
+            if t.string == 'test_whitespace':
+                self.assertEqual(t._preceding_whitespace, ' ')
+            if t.string == '(':
+                self.assertEqual(t._preceding_whitespace, '')
+            if t.string == '*':
+                self.assertEqual(t._preceding_whitespace, '')
+            if t.string == '**':
+                self.assertEqual(t._preceding_whitespace, ' ')
+            if t.string == 'print':
+                self.assertEqual(t._preceding_whitespace, '        ')
+            if t.string == 'if':
+                self.assertEqual(t._preceding_whitespace, '    ')