diff --git a/jedi/parser/diff.py b/jedi/parser/diff.py
index 6d28f062..f255fdb4 100644
--- a/jedi/parser/diff.py
+++ b/jedi/parser/diff.py
@@ -356,8 +356,7 @@ class DiffParser(object):
         is_first_token = True
         omitted_first_indent = False
         indents = []
-        l = iter(lines)
-        tokens = generate_tokens(lambda: next(l, ''), use_exact_op_types=True)
+        tokens = generate_tokens(lines, use_exact_op_types=True)
         stack = self._active_parser.pgen_parser.stack
         for typ, string, start_pos, prefix in tokens:
             start_pos = start_pos[0] + line_offset, start_pos[1]
diff --git a/jedi/parser/pgen2/pgen.py b/jedi/parser/pgen2/pgen.py
index fa2742dd..ff19a545 100644
--- a/jedi/parser/pgen2/pgen.py
+++ b/jedi/parser/pgen2/pgen.py
@@ -12,18 +12,13 @@ from jedi.parser import tokenize
 
 
 class ParserGenerator(object):
-    def __init__(self, filename, stream=None):
-        close_stream = None
-        if stream is None:
-            stream = open(filename)
-            close_stream = stream.close
+    def __init__(self, filename):
+        with open(filename) as f:
+            code = f.read()
         self.filename = filename
-        self.stream = stream
-        self.generator = tokenize.generate_tokens(stream.readline)
+        self.generator = tokenize.source_tokens(code)
         self.gettoken()  # Initialize lookahead
         self.dfas, self.startsymbol = self.parse()
-        if close_stream is not None:
-            close_stream()
         self.first = {}  # map from symbol name to set of tokens
         self.addfirstsets()
 
diff --git a/jedi/parser/tokenize.py b/jedi/parser/tokenize.py
index d54f698b..e7fbee7b 100644
--- a/jedi/parser/tokenize.py
+++ b/jedi/parser/tokenize.py
@@ -14,7 +14,6 @@ from __future__ import absolute_import
 import string
 import re
 from collections import namedtuple
-from io import StringIO
 import itertools as _itertools
 
 from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap,
@@ -207,12 +206,11 @@ class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
 
 def source_tokens(source, use_exact_op_types=False):
     """Generate tokens from a the source code (string)."""
-    source = source
-    readline = StringIO(source).readline
-    return generate_tokens(readline, use_exact_op_types)
+    lines = splitlines(source, keepends=True)
+    return generate_tokens(lines, use_exact_op_types)
 
 
-def generate_tokens(readline, use_exact_op_types=False):
+def generate_tokens(lines, use_exact_op_types=False):
     """
     A heavily modified Python standard library tokenizer.
 
@@ -222,7 +220,6 @@ def generate_tokens(readline, use_exact_op_types=False):
     """
     paren_level = 0  # count parentheses
     indents = [0]
-    lnum = 0
     max = 0
     numchars = '0123456789'
     contstr = ''
@@ -234,16 +231,7 @@ def generate_tokens(readline, use_exact_op_types=False):
     new_line = True
     prefix = ''  # Should never be required, but here for safety
     additional_prefix = ''
-    while True:            # loop over lines in stream
-        line = readline()  # readline returns empty when finished. See StringIO
-        if not line:
-            if contstr:
-                yield TokenInfo(ERRORTOKEN, contstr, contstr_start, prefix)
-                if contstr.endswith('\n'):
-                    new_line = True
-            break
-
-        lnum += 1
+    for lnum, line in enumerate(lines, 1):  # loop over lines in stream
         pos, max = 0, len(line)
 
         if contstr:                                         # continued string
@@ -359,10 +347,12 @@ def generate_tokens(readline, use_exact_op_types=False):
                 typ = OP
             yield TokenInfo(typ, token, spos, prefix)
 
-    if new_line or additional_prefix[-1:] == '\n':
-        end_pos = lnum + 1, 0
-    else:
-        end_pos = lnum, max
+    if contstr:
+        yield TokenInfo(ERRORTOKEN, contstr, contstr_start, prefix)
+        if contstr.endswith('\n'):
+            new_line = True
+
+    end_pos = lnum, max
     # As the last position we just take the maximally possible position. We
     # remove -1 for the last new line.
     for indent in indents[1:]:
diff --git a/test/test_parser/test_tokenize.py b/test/test_parser/test_tokenize.py
index 72020a54..cd94e91f 100644
--- a/test/test_parser/test_tokenize.py
+++ b/test/test_parser/test_tokenize.py
@@ -1,18 +1,19 @@
 # -*- coding: utf-8    # This file contains Unicode characters.
 
-from io import StringIO
 from textwrap import dedent
 
 from jedi._compatibility import u, is_py3, py_version
-from jedi.parser.token import NAME, OP, NEWLINE, STRING, INDENT
+from jedi.parser.token import NAME, OP, NEWLINE, STRING, INDENT, ERRORTOKEN, ENDMARKER
 from jedi.parser import ParserWithRecovery, load_grammar, tokenize
+from jedi.common import splitlines
+from jedi.parser.tokenize import TokenInfo
 
 from ..helpers import unittest
 
 
 def _get_token_list(string):
-    io = StringIO(u(string))
-    return list(tokenize.generate_tokens(io.readline))
+    return list(tokenize.source_tokens(string))
+
 
 class TokenTest(unittest.TestCase):
     def test_end_pos_one_line(self):
@@ -35,8 +36,7 @@ class TokenTest(unittest.TestCase):
     def test_simple_no_whitespace(self):
         # Test a simple one line string, no preceding whitespace
         simple_docstring = u('"""simple one line docstring"""')
-        simple_docstring_io = StringIO(simple_docstring)
-        tokens = tokenize.generate_tokens(simple_docstring_io.readline)
+        tokens = tokenize.source_tokens(simple_docstring)
         token_list = list(tokens)
         _, value, _, prefix = token_list[0]
         assert prefix == ''
@@ -45,8 +45,7 @@ class TokenTest(unittest.TestCase):
     def test_simple_with_whitespace(self):
         # Test a simple one line string with preceding whitespace and newline
         simple_docstring = u(' """simple one line docstring""" \r\n')
-        simple_docstring_io = StringIO(simple_docstring)
-        tokens = tokenize.generate_tokens(simple_docstring_io.readline)
+        tokens = tokenize.source_tokens(simple_docstring)
         token_list = list(tokens)
         assert token_list[0][0] == INDENT
         typ, value, start_pos, prefix = token_list[1]
@@ -65,8 +64,7 @@ class TokenTest(unittest.TestCase):
             if x > 0:
                 print(True)
         '''))
-        fundef_io = StringIO(fundef)
-        tokens = tokenize.generate_tokens(fundef_io.readline)
+        tokens = tokenize.source_tokens(fundef)
         token_list = list(tokens)
         for _, value, _, prefix in token_list:
             if value == 'test_whitespace':
@@ -85,10 +83,8 @@ class TokenTest(unittest.TestCase):
     def test_tokenize_multiline_I(self):
         # Make sure multiline string having newlines have the end marker on the
         # next line
-        from jedi.parser.tokenize import TokenInfo, ERRORTOKEN, ENDMARKER
         fundef = u('''""""\n''')
-        fundef_io = StringIO(fundef)
-        tokens = tokenize.generate_tokens(fundef_io.readline)
+        tokens = tokenize.source_tokens(fundef)
         token_list = list(tokens)
         assert token_list == [TokenInfo(ERRORTOKEN, '""""\n', (1, 0), ''),
                               TokenInfo(ENDMARKER , '', (2, 0), '')]
@@ -96,10 +92,8 @@ class TokenTest(unittest.TestCase):
     def test_tokenize_multiline_II(self):
         # Make sure multiline string having no newlines have the end marker on
         # same line
-        from jedi.parser.tokenize import TokenInfo, ERRORTOKEN, ENDMARKER
         fundef = u('''""""''')
-        fundef_io = StringIO(fundef)
-        tokens = tokenize.generate_tokens(fundef_io.readline)
+        tokens = tokenize.source_tokens(fundef)
         token_list = list(tokens)
         assert token_list == [TokenInfo(ERRORTOKEN, '""""', (1, 0), ''),
                               TokenInfo(ENDMARKER, '', (1, 4), '')]
@@ -107,10 +101,8 @@ class TokenTest(unittest.TestCase):
     def test_tokenize_multiline_III(self):
         # Make sure multiline string having newlines have the end marker on the
         # next line even if several newline
-        from jedi.parser.tokenize import TokenInfo, ERRORTOKEN, ENDMARKER
         fundef = u('''""""\n\n''')
-        fundef_io = StringIO(fundef)
-        tokens = tokenize.generate_tokens(fundef_io.readline)
+        tokens = tokenize.source_tokens(fundef)
         token_list = list(tokens)
         assert token_list == [TokenInfo(ERRORTOKEN, '""""\n\n', (1, 0), ''),
                               TokenInfo(ENDMARKER, '', (3, 0), '')]
@@ -120,8 +112,7 @@ class TokenTest(unittest.TestCase):
         def 我あφ():
             pass
         '''))
-        fundef_io = StringIO(fundef)
-        tokens = tokenize.generate_tokens(fundef_io.readline)
+        tokens = tokenize.source_tokens(fundef)
         token_list = list(tokens)
         unicode_token = token_list[1]
         if is_py3:
@@ -206,3 +197,17 @@ def test_error_literal():
     assert error_token.string == '"""'
     assert endmarker.type == tokenize.ENDMARKER
     assert endmarker.prefix == ''
+
+
+def test_endmarker_end_pos():
+    def check(code):
+        tokens = _get_token_list(code)
+        lines = splitlines(code)
+        assert tokens[-1].end_pos == (len(lines), len(lines[-1]))
+
+    check('#c')
+    check('#c\n')
+    check('a\n')
+    check('a')
+    check(r'a\\n')
+    check('a\\')
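Not part of the patch itself: below is a minimal usage sketch of the reworked lines-based tokenizer interface the diff introduces, assuming the jedi-internal modules touched above (jedi.parser.tokenize and jedi.common.splitlines) are importable from this revision of the tree. The example source string is invented for illustration.

# Minimal sketch (assumption: run against this revision of jedi's internals).
from jedi.common import splitlines
from jedi.parser import tokenize

source = "def foo():\n    return 1\n"

# source_tokens() now splits the code itself; each yielded TokenInfo is a
# (type, string, start_pos, prefix) namedtuple.
for tok in tokenize.source_tokens(source):
    print(tok.type, repr(tok.string), tok.start_pos, repr(tok.prefix))

# Equivalently, pre-split the source and pass generate_tokens() a list of
# lines (keepends=True) instead of the old readline callable.
lines = splitlines(source, keepends=True)
tokens = list(tokenize.generate_tokens(lines, use_exact_op_types=True))
assert tokens[-1].type == tokenize.ENDMARKER

The test_endmarker_end_pos test added at the bottom of the patch checks the same property the last assert relies on: the final ENDMARKER's end_pos lines up with splitlines() of the input.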