
Make the tokenizer a generator.

Author: Dave Halter
Date:   2017-03-09 18:53:09 +01:00
Parent: 989e4bac89
Commit: c7a74e6d1c

4 changed files with 41 additions and 52 deletions
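
This commit switches the tokenizer's input from a `readline` callable to a list of source lines; callers now pass the lines directly (or a raw string via `source_tokens`). A minimal call-site sketch based on the new signatures shown in this diff (the `code` string is a placeholder):

    from jedi.common import splitlines
    from jedi.parser import tokenize

    code = "def f():\n    pass\n"  # placeholder source string

    # New style: pass the source's lines directly ...
    tokens = tokenize.generate_tokens(splitlines(code, keepends=True))

    # ... or let the convenience wrapper do the splitting.
    tokens = tokenize.source_tokens(code)

    for typ, string, start_pos, prefix in tokens:
        print(typ, repr(string), start_pos, repr(prefix))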

View File

@@ -356,8 +356,7 @@ class DiffParser(object):
         is_first_token = True
         omitted_first_indent = False
         indents = []
-        l = iter(lines)
-        tokens = generate_tokens(lambda: next(l, ''), use_exact_op_types=True)
+        tokens = generate_tokens(lines, use_exact_op_types=True)
         stack = self._active_parser.pgen_parser.stack
         for typ, string, start_pos, prefix in tokens:
             start_pos = start_pos[0] + line_offset, start_pos[1]

View File

@@ -12,18 +12,13 @@ from jedi.parser import tokenize
 class ParserGenerator(object):
-    def __init__(self, filename, stream=None):
-        close_stream = None
-        if stream is None:
-            stream = open(filename)
-            close_stream = stream.close
+    def __init__(self, filename):
+        with open(filename) as f:
+            code = f.read()
         self.filename = filename
-        self.stream = stream
-        self.generator = tokenize.generate_tokens(stream.readline)
+        self.generator = tokenize.source_tokens(code)
         self.gettoken()  # Initialize lookahead
         self.dfas, self.startsymbol = self.parse()
-        if close_stream is not None:
-            close_stream()
         self.first = {}  # map from symbol name to set of tokens
         self.addfirstsets()

View File

@@ -14,7 +14,6 @@ from __future__ import absolute_import
 import string
 import re
 from collections import namedtuple
-from io import StringIO
 import itertools as _itertools

 from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap,
@@ -207,12 +206,11 @@ class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
 def source_tokens(source, use_exact_op_types=False):
     """Generate tokens from a the source code (string)."""
-    source = source
-    readline = StringIO(source).readline
-    return generate_tokens(readline, use_exact_op_types)
+    lines = splitlines(source, keepends=True)
+    return generate_tokens(lines, use_exact_op_types)


-def generate_tokens(readline, use_exact_op_types=False):
+def generate_tokens(lines, use_exact_op_types=False):
     """
     A heavily modified Python standard library tokenizer.
@@ -222,7 +220,6 @@ def generate_tokens(readline, use_exact_op_types=False):
""" """
paren_level = 0 # count parentheses paren_level = 0 # count parentheses
indents = [0] indents = [0]
lnum = 0
max = 0 max = 0
numchars = '0123456789' numchars = '0123456789'
contstr = '' contstr = ''
@@ -234,16 +231,7 @@ def generate_tokens(readline, use_exact_op_types=False):
     new_line = True
     prefix = ''  # Should never be required, but here for safety
     additional_prefix = ''
-    while True:            # loop over lines in stream
-        line = readline()  # readline returns empty when finished. See StringIO
-        if not line:
-            if contstr:
-                yield TokenInfo(ERRORTOKEN, contstr, contstr_start, prefix)
-                if contstr.endswith('\n'):
-                    new_line = True
-            break
-
-        lnum += 1
+    for lnum, line in enumerate(lines, 1):  # loop over lines in stream
         pos, max = 0, len(line)

         if contstr:                                         # continued string
@@ -359,10 +347,12 @@ def generate_tokens(readline, use_exact_op_types=False):
                     typ = OP
                 yield TokenInfo(typ, token, spos, prefix)

-    if new_line or additional_prefix[-1:] == '\n':
-        end_pos = lnum + 1, 0
-    else:
-        end_pos = lnum, max
+    if contstr:
+        yield TokenInfo(ERRORTOKEN, contstr, contstr_start, prefix)
+        if contstr.endswith('\n'):
+            new_line = True
+
+    end_pos = lnum, max
     # As the last position we just take the maximally possible position. We
     # remove -1 for the last new line.
     for indent in indents[1:]:
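
Because there is no longer an empty sentinel line from `readline()` to signal end of input, the final `contstr` flush moves out of the loop, and the end position is simply taken from the last line the `for` loop saw. A small sketch of the assumption this relies on (jedi's `splitlines`, as I understand it, keeps a trailing empty string when the source ends in a newline):

    from jedi.common import splitlines

    # With keepends=True, 'a\n' splits into ['a\n', ''], so the loop's last
    # iteration has lnum == 2 and max == 0, i.e. end_pos == (2, 0).
    assert splitlines('a\n', keepends=True) == ['a\n', '']
    assert splitlines('a', keepends=True) == ['a']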

View File

@@ -1,18 +1,19 @@
 # -*- coding: utf-8    # This file contains Unicode characters.
-from io import StringIO
 from textwrap import dedent

 from jedi._compatibility import u, is_py3, py_version
-from jedi.parser.token import NAME, OP, NEWLINE, STRING, INDENT
+from jedi.parser.token import NAME, OP, NEWLINE, STRING, INDENT, ERRORTOKEN, ENDMARKER
 from jedi.parser import ParserWithRecovery, load_grammar, tokenize
+from jedi.common import splitlines
+from jedi.parser.tokenize import TokenInfo

 from ..helpers import unittest


 def _get_token_list(string):
-    io = StringIO(u(string))
-    return list(tokenize.generate_tokens(io.readline))
+    return list(tokenize.source_tokens(string))


 class TokenTest(unittest.TestCase):
     def test_end_pos_one_line(self):
@@ -35,8 +36,7 @@ class TokenTest(unittest.TestCase):
     def test_simple_no_whitespace(self):
         # Test a simple one line string, no preceding whitespace
         simple_docstring = u('"""simple one line docstring"""')
-        simple_docstring_io = StringIO(simple_docstring)
-        tokens = tokenize.generate_tokens(simple_docstring_io.readline)
+        tokens = tokenize.source_tokens(simple_docstring)
         token_list = list(tokens)
         _, value, _, prefix = token_list[0]
         assert prefix == ''
@@ -45,8 +45,7 @@ class TokenTest(unittest.TestCase):
     def test_simple_with_whitespace(self):
         # Test a simple one line string with preceding whitespace and newline
         simple_docstring = u(' """simple one line docstring""" \r\n')
-        simple_docstring_io = StringIO(simple_docstring)
-        tokens = tokenize.generate_tokens(simple_docstring_io.readline)
+        tokens = tokenize.source_tokens(simple_docstring)
         token_list = list(tokens)
         assert token_list[0][0] == INDENT
         typ, value, start_pos, prefix = token_list[1]
@@ -65,8 +64,7 @@ class TokenTest(unittest.TestCase):
             if x > 0:
                 print(True)
         '''))
-        fundef_io = StringIO(fundef)
-        tokens = tokenize.generate_tokens(fundef_io.readline)
+        tokens = tokenize.source_tokens(fundef)
         token_list = list(tokens)
         for _, value, _, prefix in token_list:
             if value == 'test_whitespace':
@@ -85,10 +83,8 @@ class TokenTest(unittest.TestCase):
     def test_tokenize_multiline_I(self):
         # Make sure multiline string having newlines have the end marker on the
         # next line
-        from jedi.parser.tokenize import TokenInfo, ERRORTOKEN, ENDMARKER
         fundef = u('''""""\n''')
-        fundef_io = StringIO(fundef)
-        tokens = tokenize.generate_tokens(fundef_io.readline)
+        tokens = tokenize.source_tokens(fundef)
         token_list = list(tokens)
         assert token_list == [TokenInfo(ERRORTOKEN, '""""\n', (1, 0), ''),
                               TokenInfo(ENDMARKER , '', (2, 0), '')]
@@ -96,10 +92,8 @@ class TokenTest(unittest.TestCase):
     def test_tokenize_multiline_II(self):
         # Make sure multiline string having no newlines have the end marker on
         # same line
-        from jedi.parser.tokenize import TokenInfo, ERRORTOKEN, ENDMARKER
         fundef = u('''""""''')
-        fundef_io = StringIO(fundef)
-        tokens = tokenize.generate_tokens(fundef_io.readline)
+        tokens = tokenize.source_tokens(fundef)
         token_list = list(tokens)
         assert token_list == [TokenInfo(ERRORTOKEN, '""""', (1, 0), ''),
                               TokenInfo(ENDMARKER, '', (1, 4), '')]
@@ -107,10 +101,8 @@ class TokenTest(unittest.TestCase):
     def test_tokenize_multiline_III(self):
         # Make sure multiline string having newlines have the end marker on the
         # next line even if several newline
-        from jedi.parser.tokenize import TokenInfo, ERRORTOKEN, ENDMARKER
         fundef = u('''""""\n\n''')
-        fundef_io = StringIO(fundef)
-        tokens = tokenize.generate_tokens(fundef_io.readline)
+        tokens = tokenize.source_tokens(fundef)
         token_list = list(tokens)
         assert token_list == [TokenInfo(ERRORTOKEN, '""""\n\n', (1, 0), ''),
                               TokenInfo(ENDMARKER, '', (3, 0), '')]
@@ -120,8 +112,7 @@ class TokenTest(unittest.TestCase):
         def 我あφ():
             pass
         '''))
-        fundef_io = StringIO(fundef)
-        tokens = tokenize.generate_tokens(fundef_io.readline)
+        tokens = tokenize.source_tokens(fundef)
         token_list = list(tokens)
         unicode_token = token_list[1]
         if is_py3:
@@ -206,3 +197,17 @@ def test_error_literal():
     assert error_token.string == '"""'
     assert endmarker.type == tokenize.ENDMARKER
     assert endmarker.prefix == ''
+
+
+def test_endmarker_end_pos():
+    def check(code):
+        tokens = _get_token_list(code)
+        lines = splitlines(code)
+        assert tokens[-1].end_pos == (len(lines), len(lines[-1]))
+
+    check('#c')
+    check('#c\n')
+    check('a\n')
+    check('a')
+    check(r'a\\n')
+    check('a\\')