diff --git a/jedi/cache.py b/jedi/cache.py
index ad73c082..e0533150 100644
--- a/jedi/cache.py
+++ b/jedi/cache.py
@@ -243,7 +243,7 @@ def save_parser(path, name, parser, pickling=True):
 
 class ParserPickling(object):
 
-    version = 13
+    version = 14
     """
     Version number (integer) for file system cache.
 
diff --git a/jedi/parser/tokenize.py b/jedi/parser/tokenize.py
index 40ecab1b..737bbc82 100644
--- a/jedi/parser/tokenize.py
+++ b/jedi/parser/tokenize.py
@@ -37,43 +37,47 @@ tok_name[COMMENT] = 'COMMENT'
 class Token(object):
     """
     The token object is an efficient representation of the structure
-    (type, token, (start_pos_line, start_pos_col)). It has indexer
+    (type, token, (start_pos_line, start_pos_col, preceding_whitespace)). It has indexer
     methods that maintain compatibility to existing code that expects the above
     structure.
 
-    >>> repr(Token(1, "test", (1, 1)))
-    "<Token: ('NAME', 'test', (1, 1))>"
-    >>> Token(1, 'bar', (3, 4)).__getstate__()
-    (1, 'bar', 3, 4)
-    >>> a = Token(0, 'baz', (0, 0))
-    >>> a.__setstate__((1, 'foo', 3, 4))
+    >>> repr(Token(1, "test", (1, 1, '')))
+    "<Token: ('NAME', 'test', (1, 1, ''))>"
+    >>> Token(1, 'bar', (3, 4, '')).__getstate__()
+    (1, 'bar', 3, 4, '')
+    >>> a = Token(0, 'baz', (0, 0, ''))
+    >>> a.__setstate__((1, 'foo', 3, 4, ''))
     >>> a
-    <Token: ('NAME', 'foo', (3, 4))>
+    <Token: ('NAME', 'foo', (3, 4, ''))>
    >>> a.start_pos
     (3, 4)
     >>> a.string
     'foo'
     >>> a._start_pos_col
     4
-    >>> Token(1, u("😷"), (1 ,1)).string + "p" == u("😷p")
+    >>> Token(1, u("😷"), (1 ,1, '')).string + "p" == u("😷p")
     True
     """
-    __slots__ = ("type", "string", "_start_pos_line", "_start_pos_col")
+    __slots__ = ("type", "string", "_start_pos_line", "_start_pos_col",
+                 "_preceding_whitespace")
 
-    def __init__(self, type, string, start_pos):
+    def __init__(self, type, string, start_pos, whitespace=''):
         self.type = type
         self.string = string
         self._start_pos_line = start_pos[0]
         self._start_pos_col = start_pos[1]
+        self._preceding_whitespace = whitespace
 
     def __repr__(self):
         typ = tok_name[self.type]
-        content = typ, self.string, (self._start_pos_line, self._start_pos_col)
+        content = typ, self.string,\
+            (self._start_pos_line, self._start_pos_col,
+             self._preceding_whitespace)
         return "<%s: %s>" % (type(self).__name__, content)
 
     @property
     def start_pos(self):
-        return (self._start_pos_line, self._start_pos_col)
+        return self._start_pos_line, self._start_pos_col
 
     @property
     def end_pos(self):
@@ -94,13 +98,16 @@ class Token(object):
 
     # Make cache footprint smaller for faster unpickling
     def __getstate__(self):
-        return (self.type, self.string, self._start_pos_line, self._start_pos_col)
+        return (self.type, self.string,
+                self._start_pos_line, self._start_pos_col,
+                self._preceding_whitespace)
 
     def __setstate__(self, state):
         self.type = state[0]
         self.string = state[1]
         self._start_pos_line = state[2]
         self._start_pos_col = state[3]
+        self._preceding_whitespace = state[4]
 
 
 def group(*choices):
@@ -158,7 +165,8 @@ cont_str = group(r"[bBuU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                  r'[bBuU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                  group('"', r'\\\r?\n'))
 pseudo_extras = group(r'\\\r?\n', comment, triple)
-pseudo_token = whitespace + group(pseudo_extras, number, funny, cont_str, name)
+pseudo_token = group(whitespace) + \
+    group(pseudo_extras, number, funny, cont_str, name)
 
 
 def _compile(expr):
@@ -167,6 +175,7 @@ def _compile(expr):
 
 pseudoprog, single3prog, double3prog = map(
     _compile, (pseudo_token, single3, double3))
+
 endprogs = {"'": _compile(single), '"': _compile(double),
             "'''": single3prog, '"""': double3prog,
             "r'''": single3prog, 'r"""': double3prog,
@@ -219,11 +228,12 @@ def generate_tokens(readline, line_offset=0):
     numchars = '0123456789'
     contstr = ''
     contline = None
-    while True:                # loop over lines in stream
-        line = readline()  # readline returns empty if it's finished. See StringIO
+    ws = ''  # Should never be required, but here for safety
+    while True:  # loop over lines in stream
+        line = readline()  # readline returns empty when finished. See StringIO
         if not line:
             if contstr:
-                yield Token(ERRORTOKEN, contstr, contstr_start)
+                yield Token(ERRORTOKEN, contstr, contstr_start, whitespace=ws)
             break
 
         lnum += 1
@@ -233,7 +243,8 @@ def generate_tokens(readline, line_offset=0):
             endmatch = endprog.match(line)
             if endmatch:
                 pos = endmatch.end(0)
-                yield Token(STRING, contstr + line[:pos], contstr_start)
+                yield Token(STRING, contstr + line[:pos],
+                            contstr_start, whitespace=ws)
                 contstr = ''
                 contline = None
             else:
@@ -248,32 +259,33 @@ def generate_tokens(readline, line_offset=0):
                 if line[pos] in '"\'':
                     # If a literal starts but doesn't end the whole rest of the
                     # line is an error token.
-                    txt = txt = line[pos:]
+                    txt = line[pos:]
                     yield Token(ERRORTOKEN, txt, (lnum, pos))
                     pos += 1
                     continue
 
-            start, pos = pseudomatch.span(1)
+            ws = pseudomatch.group(1)
+            start, pos = pseudomatch.span(2)
             spos = (lnum, start)
             token, initial = line[start:pos], line[start]
 
             if (initial in numchars or  # ordinary number
                     (initial == '.' and token != '.' and token != '...')):
-                yield Token(NUMBER, token, spos)
+                yield Token(NUMBER, token, spos, whitespace=ws)
             elif initial in '\r\n':
-                yield Token(NEWLINE, token, spos)
+                yield Token(NEWLINE, token, spos, whitespace=ws)
            elif initial == '#':
                 assert not token.endswith("\n")
-                yield Token(COMMENT, token, spos)
+                yield Token(COMMENT, token, spos, whitespace=ws)
             elif token in triple_quoted:
                 endprog = endprogs[token]
                 endmatch = endprog.match(line, pos)
                 if endmatch:  # all on one line
                     pos = endmatch.end(0)
                     token = line[start:pos]
-                    yield Token(STRING, token, spos)
+                    yield Token(STRING, token, spos, whitespace=ws)
                 else:
-                    contstr_start = (lnum, start)   # multiple lines
+                    contstr_start = (lnum, start)  # multiple lines
                     contstr = line[start:]
                     contline = line
                     break
@@ -288,12 +300,12 @@ def generate_tokens(readline, line_offset=0):
                     contline = line
                     break
                 else:  # ordinary string
-                    yield Token(STRING, token, spos)
+                    yield Token(STRING, token, spos, whitespace=ws)
             elif initial in namechars:  # ordinary name
-                yield Token(NAME, token, spos)
+                yield Token(NAME, token, spos, whitespace=ws)
             elif initial == '\\' and line[start:] == '\\\n':  # continued stmt
                 continue
             else:
-                yield Token(OP, token, spos)
+                yield Token(OP, token, spos, whitespace=ws)
 
     yield Token(ENDMARKER, '', (lnum, 0))
diff --git a/test/test_parser/test_token.py b/test/test_parser/test_token.py
deleted file mode 100644
index 0295040d..00000000
--- a/test/test_parser/test_token.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from jedi._compatibility import u
-from jedi import parser
-
-from ..helpers import unittest
-
-
-class TokenTest(unittest.TestCase):
-    def test_end_pos_one_line(self):
-        parsed = parser.Parser(u('''
-def testit():
-    a = "huhu"
-'''))
-        tok = parsed.module.subscopes[0].statements[0]._token_list[2]
-        self.assertEqual(tok.end_pos, (3, 14))
-
-    def test_end_pos_multi_line(self):
-        parsed = parser.Parser(u('''
-def testit():
-    a = """huhu
-asdfasdf""" + "h"
-'''))
-        tok = parsed.module.subscopes[0].statements[0]._token_list[2]
-        self.assertEqual(tok.end_pos, (4, 11))
diff --git a/test/test_parser/test_tokenizer.py b/test/test_parser/test_tokenizer.py
new file mode 100644
index 00000000..35ec6abe
--- /dev/null
+++ b/test/test_parser/test_tokenizer.py
@@ -0,0 +1,75 @@
+from io import StringIO
+from token import NEWLINE, STRING
+
+from jedi._compatibility import u
+from jedi import parser
+
+from ..helpers import unittest
+
+
+class TokenTest(unittest.TestCase):
+    def test_end_pos_one_line(self):
+        parsed = parser.Parser(u('''
+def testit():
+    a = "huhu"
+'''))
+        tok = parsed.module.subscopes[0].statements[0]._token_list[2]
+        self.assertEqual(tok.end_pos, (3, 14))
+
+    def test_end_pos_multi_line(self):
+        parsed = parser.Parser(u('''
+def testit():
+    a = """huhu
+asdfasdf""" + "h"
+'''))
+        tok = parsed.module.subscopes[0].statements[0]._token_list[2]
+        self.assertEqual(tok.end_pos, (4, 11))
+
+    def test_simple_no_whitespace(self):
+        # Test a simple one line string, no preceding whitespace
+        simple_docstring = '"""simple one line docstring"""'
+        simple_docstring_io = StringIO(simple_docstring)
+        tokens = parser.tokenize.generate_tokens(simple_docstring_io.readline)
+        token_list = list(tokens)
+        string_token = token_list[0]
+        self.assertEqual(string_token._preceding_whitespace, '')
+        self.assertEqual(string_token.string, '"""simple one line docstring"""')
+
+    def test_simple_with_whitespace(self):
+        # Test a simple one line string with preceding whitespace and newline
+        simple_docstring = ' """simple one line docstring""" \r\n'
+        simple_docstring_io = StringIO(simple_docstring)
+        tokens = parser.tokenize.generate_tokens(simple_docstring_io.readline)
+        token_list = list(tokens)
+        string_token = token_list[0]
+        self.assertEqual(string_token._preceding_whitespace, ' ')
+        self.assertEqual(string_token.string, '"""simple one line docstring"""')
+        self.assertEqual(string_token.type, STRING)
+        newline_token = token_list[1]
+        self.assertEqual(newline_token._preceding_whitespace, ' ')
+        self.assertEqual(newline_token.type, NEWLINE)
+
+    def test_function_whitespace(self):
+        # Test function definition whitespace identification
+        fundef = '''def test_whitespace(*args, **kwargs):
+    x = 1
+    if x > 0:
+        print(True)
+'''
+        fundef_io = StringIO(fundef)
+        tokens = parser.tokenize.generate_tokens(fundef_io.readline)
+        token_list = list(tokens)
+        print(token_list)
+        for t in token_list:
+            if t.string == 'test_whitespace':
+                self.assertEqual(t._preceding_whitespace, ' ')
+            if t.string == '(':
+                self.assertEqual(t._preceding_whitespace, '')
+            if t.string == '*':
+                self.assertEqual(t._preceding_whitespace, '')
+            if t.string == '**':
+                self.assertEqual(t._preceding_whitespace, ' ')
+            if t.string == 'print':
+                self.assertEqual(t._preceding_whitespace, '        ')
+            if t.string == 'if':
+                self.assertEqual(t._preceding_whitespace, '    ')
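
For reference: with this change every Token records the run of spaces and tabs that sat directly in front of it on its source line, so later consumers can reproduce the original layout without re-reading the file. Below is a minimal sketch of how the new attribute can be read back after tokenizing; it is illustrative only (not part of the patch), assumes the patched jedi.parser.tokenize module is importable, and uses purely local variable names.

    from io import StringIO

    from jedi.parser import tokenize

    source = 'def foo(a, b):\n    return a + b\n'

    # generate_tokens() consumes a readline callable, just like the tests above.
    for tok in tokenize.generate_tokens(StringIO(source).readline):
        # Each token carries the whitespace found immediately before it,
        # e.g. '    ' precedes the 'return' token on the indented second line.
        print('%r preceded by %r at %r' % (tok.string,
                                           tok._preceding_whitespace,
                                           tok.start_pos))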