Make the tokenizer a generator.
@@ -356,8 +356,7 @@ class DiffParser(object):
         is_first_token = True
         omitted_first_indent = False
         indents = []
-        l = iter(lines)
-        tokens = generate_tokens(lambda: next(l, ''), use_exact_op_types=True)
+        tokens = generate_tokens(lines, use_exact_op_types=True)
         stack = self._active_parser.pgen_parser.stack
         for typ, string, start_pos, prefix in tokens:
             start_pos = start_pos[0] + line_offset, start_pos[1]
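Illustrative sketch (not part of the patch): the hunk above changes the call site from a readline-style callable to a plain list of lines. `fake_generate_tokens` below is a hypothetical stand-in with the new `generate_tokens(lines, use_exact_op_types=...)` shape, used only to show the caller side.

```python
# Standalone sketch; fake_generate_tokens is a hypothetical stand-in for the
# new signature shown above and yields 4-tuples like jedi's TokenInfo.
def fake_generate_tokens(lines, use_exact_op_types=False):
    for lnum, line in enumerate(lines, 1):
        yield ('FAKE', line.rstrip('\n'), (lnum, 0), '')

lines = ['a = 1\n', 'b = 2\n']
# Old call site needed an adapter:  l = iter(lines); generate_tokens(lambda: next(l, ''), ...)
# New call site passes the line list directly and unpacks 4-tuples as before:
line_offset = 10
for typ, string, start_pos, prefix in fake_generate_tokens(lines):
    start_pos = start_pos[0] + line_offset, start_pos[1]
    assert start_pos[0] > line_offset and prefix == ''
```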
@@ -12,18 +12,13 @@ from jedi.parser import tokenize


 class ParserGenerator(object):
-    def __init__(self, filename, stream=None):
-        close_stream = None
-        if stream is None:
-            stream = open(filename)
-            close_stream = stream.close
+    def __init__(self, filename):
+        with open(filename) as f:
+            code = f.read()
         self.filename = filename
-        self.stream = stream
-        self.generator = tokenize.generate_tokens(stream.readline)
+        self.generator = tokenize.source_tokens(code)
         self.gettoken() # Initialize lookahead
         self.dfas, self.startsymbol = self.parse()
-        if close_stream is not None:
-            close_stream()
         self.first = {} # map from symbol name to set of tokens
         self.addfirstsets()

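Note on the pgen change (illustration, not part of the patch): `ParserGenerator` now reads the grammar file eagerly and hands the whole string to `tokenize.source_tokens`, so the `stream`/`close_stream` bookkeeping disappears. A minimal sketch of the eager-read pattern; the file name and `tokenize_source` are placeholders.

```python
# Minimal sketch of the eager-read pattern adopted above; 'grammar.txt' is a
# placeholder name and tokenize_source stands in for tokenize.source_tokens.
def read_grammar(filename):
    with open(filename) as f:   # the file is closed as soon as the block exits
        return f.read()         # no close_stream bookkeeping needed afterwards

# code = read_grammar('grammar.txt')
# generator = tokenize_source(code)
```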
@@ -14,7 +14,6 @@ from __future__ import absolute_import
 import string
 import re
 from collections import namedtuple
-from io import StringIO
 import itertools as _itertools

 from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap,
@@ -207,12 +206,11 @@ class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):

 def source_tokens(source, use_exact_op_types=False):
     """Generate tokens from a the source code (string)."""
-    source = source
-    readline = StringIO(source).readline
-    return generate_tokens(readline, use_exact_op_types)
+    lines = splitlines(source, keepends=True)
+    return generate_tokens(lines, use_exact_op_types)


-def generate_tokens(readline, use_exact_op_types=False):
+def generate_tokens(lines, use_exact_op_types=False):
     """
     A heavily modified Python standard library tokenizer.

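For reference (not part of the patch): `source_tokens` now splits the source with `splitlines(..., keepends=True)` and feeds that list to `generate_tokens`. The stdlib call below is only a rough stand-in; jedi's own `splitlines` helper may differ in detail (for example by keeping a final empty line after a trailing newline).

```python
# str.splitlines(keepends=True) as a rough stand-in for jedi.common.splitlines.
source = 'x = 1\ny = 2\n'
lines = source.splitlines(keepends=True)
assert lines == ['x = 1\n', 'y = 2\n']
# jedi's helper is assumed here to additionally yield a trailing '' so that
# the ENDMARKER can be reported on its own (empty) last line.
```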
@@ -222,7 +220,6 @@ def generate_tokens(readline, use_exact_op_types=False):
     """
     paren_level = 0 # count parentheses
     indents = [0]
-    lnum = 0
     max = 0
     numchars = '0123456789'
     contstr = ''
@@ -234,16 +231,7 @@ def generate_tokens(readline, use_exact_op_types=False):
     new_line = True
     prefix = '' # Should never be required, but here for safety
     additional_prefix = ''
-    while True: # loop over lines in stream
-        line = readline() # readline returns empty when finished. See StringIO
-        if not line:
-            if contstr:
-                yield TokenInfo(ERRORTOKEN, contstr, contstr_start, prefix)
-                if contstr.endswith('\n'):
-                    new_line = True
-            break
-
-        lnum += 1
+    for lnum, line in enumerate(lines, 1): # loop over lines in stream
         pos, max = 0, len(line)

         if contstr: # continued string
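Standalone sketch of the loop rewrite above (no jedi imports): `enumerate(lines, 1)` replaces the manual `lnum` counter and the `readline() == ''` termination check while visiting the same `(lnum, line)` pairs.

```python
def old_style(readline):
    lnum = 0
    while True:
        line = readline()
        if not line:          # readline() returns '' when exhausted
            break
        lnum += 1
        yield lnum, line

def new_style(lines):
    for lnum, line in enumerate(lines, 1):
        yield lnum, line

lines = ['a\n', 'b\n']
l = iter(lines)
assert list(old_style(lambda: next(l, ''))) == list(new_style(lines)) == [(1, 'a\n'), (2, 'b\n')]
```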
@@ -359,10 +347,12 @@ def generate_tokens(readline, use_exact_op_types=False):
                 typ = OP
             yield TokenInfo(typ, token, spos, prefix)

-    if new_line or additional_prefix[-1:] == '\n':
-        end_pos = lnum + 1, 0
-    else:
-        end_pos = lnum, max
+    if contstr:
+        yield TokenInfo(ERRORTOKEN, contstr, contstr_start, prefix)
+        if contstr.endswith('\n'):
+            new_line = True
+
+    end_pos = lnum, max
     # As the last position we just take the maximally possible position. We
     # remove -1 for the last new line.
     for indent in indents[1:]:
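Sketch of the new end-of-input bookkeeping (standalone, not jedi's code): without an `if not line: break` branch, flushing an unterminated string moves to after the loop, and the final position is always the last line number plus that line's length.

```python
lines = ['x = 1\n', 'y']    # last line has no trailing newline
lnum = max_ = 0             # the real code uses 'max'; renamed here to avoid the builtin
for lnum, line in enumerate(lines, 1):
    pos, max_ = 0, len(line)
end_pos = lnum, max_        # always the last line and its length now
assert end_pos == (2, 1)
```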
@@ -1,18 +1,19 @@
 # -*- coding: utf-8 # This file contains Unicode characters.

-from io import StringIO
 from textwrap import dedent

 from jedi._compatibility import u, is_py3, py_version
-from jedi.parser.token import NAME, OP, NEWLINE, STRING, INDENT
+from jedi.parser.token import NAME, OP, NEWLINE, STRING, INDENT, ERRORTOKEN, ENDMARKER
 from jedi.parser import ParserWithRecovery, load_grammar, tokenize
+from jedi.common import splitlines
+from jedi.parser.tokenize import TokenInfo


 from ..helpers import unittest


 def _get_token_list(string):
-    io = StringIO(u(string))
-    return list(tokenize.generate_tokens(io.readline))
+    return list(tokenize.source_tokens(string))


 class TokenTest(unittest.TestCase):
     def test_end_pos_one_line(self):
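Usage sketch of the updated test helper (assumes this revision of jedi is importable; `source_tokens` and the `TokenInfo` fields are the ones shown in the hunks above):

```python
from jedi.parser import tokenize

def get_token_list(string):
    # no StringIO/readline indirection any more
    return list(tokenize.source_tokens(string))

# tokens = get_token_list('x = 1\n')
# e.g. tokens[0] is expected to be a NAME token 'x' starting at (1, 0)
```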
@@ -35,8 +36,7 @@ class TokenTest(unittest.TestCase):
     def test_simple_no_whitespace(self):
         # Test a simple one line string, no preceding whitespace
         simple_docstring = u('"""simple one line docstring"""')
-        simple_docstring_io = StringIO(simple_docstring)
-        tokens = tokenize.generate_tokens(simple_docstring_io.readline)
+        tokens = tokenize.source_tokens(simple_docstring)
         token_list = list(tokens)
         _, value, _, prefix = token_list[0]
         assert prefix == ''
@@ -45,8 +45,7 @@ class TokenTest(unittest.TestCase):
     def test_simple_with_whitespace(self):
         # Test a simple one line string with preceding whitespace and newline
         simple_docstring = u(' """simple one line docstring""" \r\n')
-        simple_docstring_io = StringIO(simple_docstring)
-        tokens = tokenize.generate_tokens(simple_docstring_io.readline)
+        tokens = tokenize.source_tokens(simple_docstring)
         token_list = list(tokens)
         assert token_list[0][0] == INDENT
         typ, value, start_pos, prefix = token_list[1]
@@ -65,8 +64,7 @@ class TokenTest(unittest.TestCase):
             if x > 0:
                 print(True)
         '''))
-        fundef_io = StringIO(fundef)
-        tokens = tokenize.generate_tokens(fundef_io.readline)
+        tokens = tokenize.source_tokens(fundef)
         token_list = list(tokens)
         for _, value, _, prefix in token_list:
             if value == 'test_whitespace':
@@ -85,10 +83,8 @@ class TokenTest(unittest.TestCase):
     def test_tokenize_multiline_I(self):
         # Make sure multiline string having newlines have the end marker on the
         # next line
-        from jedi.parser.tokenize import TokenInfo, ERRORTOKEN, ENDMARKER
         fundef = u('''""""\n''')
-        fundef_io = StringIO(fundef)
-        tokens = tokenize.generate_tokens(fundef_io.readline)
+        tokens = tokenize.source_tokens(fundef)
         token_list = list(tokens)
         assert token_list == [TokenInfo(ERRORTOKEN, '""""\n', (1, 0), ''),
                               TokenInfo(ENDMARKER , '', (2, 0), '')]
@@ -96,10 +92,8 @@ class TokenTest(unittest.TestCase):
     def test_tokenize_multiline_II(self):
         # Make sure multiline string having no newlines have the end marker on
         # same line
-        from jedi.parser.tokenize import TokenInfo, ERRORTOKEN, ENDMARKER
         fundef = u('''""""''')
-        fundef_io = StringIO(fundef)
-        tokens = tokenize.generate_tokens(fundef_io.readline)
+        tokens = tokenize.source_tokens(fundef)
         token_list = list(tokens)
         assert token_list == [TokenInfo(ERRORTOKEN, '""""', (1, 0), ''),
                               TokenInfo(ENDMARKER, '', (1, 4), '')]
@@ -107,10 +101,8 @@ class TokenTest(unittest.TestCase):
     def test_tokenize_multiline_III(self):
         # Make sure multiline string having newlines have the end marker on the
         # next line even if several newline
-        from jedi.parser.tokenize import TokenInfo, ERRORTOKEN, ENDMARKER
         fundef = u('''""""\n\n''')
-        fundef_io = StringIO(fundef)
-        tokens = tokenize.generate_tokens(fundef_io.readline)
+        tokens = tokenize.source_tokens(fundef)
         token_list = list(tokens)
         assert token_list == [TokenInfo(ERRORTOKEN, '""""\n\n', (1, 0), ''),
                               TokenInfo(ENDMARKER, '', (3, 0), '')]
@@ -120,8 +112,7 @@ class TokenTest(unittest.TestCase):
         def 我あφ():
             pass
         '''))
-        fundef_io = StringIO(fundef)
-        tokens = tokenize.generate_tokens(fundef_io.readline)
+        tokens = tokenize.source_tokens(fundef)
         token_list = list(tokens)
         unicode_token = token_list[1]
         if is_py3:
@@ -206,3 +197,17 @@ def test_error_literal():
     assert error_token.string == '"""'
     assert endmarker.type == tokenize.ENDMARKER
     assert endmarker.prefix == ''
+
+
+def test_endmarker_end_pos():
+    def check(code):
+        tokens = _get_token_list(code)
+        lines = splitlines(code)
+        assert tokens[-1].end_pos == (len(lines), len(lines[-1]))
+
+    check('#c')
+    check('#c\n')
+    check('a\n')
+    check('a')
+    check(r'a\\n')
+    check('a\\')
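Worked check of the invariant asserted by `test_endmarker_end_pos` above (standalone; `code.split('\n')` stands in for `jedi.common.splitlines`, which is assumed to keep a final empty entry after a trailing newline):

```python
for code, expected in [('#c', (1, 2)), ('#c\n', (2, 0)), ('a\n', (2, 0)), ('a', (1, 1))]:
    lines = code.split('\n')                         # stand-in for splitlines(code)
    assert (len(lines), len(lines[-1])) == expected  # == ENDMARKER end_pos per the test
```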