1
0
forked from VimPlug/jedi

Update the tokenizer to include f-string literals and underscores in numeric literals. Tests are still needed.

This commit is contained in:
Dave Halter
2017-01-08 16:03:45 +01:00
parent 3f09f3a304
commit 00a9f1ec0a
3 changed files with 123 additions and 107 deletions

View File

@@ -12,11 +12,12 @@ try:
except ImportError: except ImportError:
pass pass
is_py3 = sys.version_info[0] >= 3 is_py3 = sys.version_info.major >= 3
is_py33 = is_py3 and sys.version_info.minor >= 3 is_py33 = is_py3 and sys.version_info.minor >= 3
is_py34 = is_py3 and sys.version_info.minor >= 4 is_py34 = is_py3 and sys.version_info.minor >= 4
is_py35 = is_py3 and sys.version_info.minor >= 5 is_py35 = is_py3 and sys.version_info.minor >= 5
is_py26 = not is_py3 and sys.version_info[1] < 7 is_py26 = not is_py3 and sys.version_info[1] < 7
py_version = int(str(sys.version_info.major) + str(sys.version_info.minor))
class DummyFile(object): class DummyFile(object):

View File

@@ -15,10 +15,11 @@ import string
import re import re
from collections import namedtuple from collections import namedtuple
from io import StringIO from io import StringIO
import itertools as _itertools
from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap, from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap,
NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT) NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
from jedi._compatibility import is_py3 from jedi._compatibility import is_py3, py_version
from jedi.common import splitlines from jedi.common import splitlines
@@ -37,121 +38,127 @@ COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT' tok_name[COMMENT] = 'COMMENT'
def group(*choices): def group(*choices, **kwargs):
return '(' + '|'.join(choices) + ')' capture = kwargs.pop('capture', False) # Python 2, arrghhhhh :(
assert not kwargs
start = '('
if not capture:
start += '?:'
return start + '|'.join(choices) + ')'
def any(*choices):
return group(*choices) + '*'
def maybe(*choices): def maybe(*choices):
return group(*choices) + '?' return group(*choices) + '?'
# Note: we use unicode matching for names ("\w") but ascii matching for # Note: we use unicode matching for names ("\w") but ascii matching for
# number literals. # number literals.
whitespace = r'[ \f\t]*' Whitespace = r'[ \f\t]*'
comment = r'#[^\r\n]*' Comment = r'#[^\r\n]*'
name = r'\w+' Name = r'\w+'
hex_number = r'0[xX][0-9a-fA-F]+' Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
bin_number = r'0[bB][01]+' Binnumber = r'0[bB](?:_?[01])+'
if is_py3: if is_py3:
oct_number = r'0[oO][0-7]+' Octnumber = r'0[oO](?:_?[0-7])+'
else: else:
oct_number = '0[0-7]+' Octnumber = '0[0-7]+'
dec_number = r'(?:0+|[1-9][0-9]*)'
int_number = group(hex_number, bin_number, oct_number, dec_number)
exponent = r'[eE][-+]?[0-9]+'
point_float = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(exponent)
Expfloat = r'[0-9]+' + exponent
float_number = group(point_float, Expfloat)
imag_number = group(r'[0-9]+[jJ]', float_number + r'[jJ]')
number = group(imag_number, float_number, int_number)
# Tail end of ' string. Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
single = r"[^'\\]*(?:\\.[^'\\]*)*'" Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
# Tail end of " string. Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
double = r'[^"\\]*(?:\\.[^"\\]*)*"' Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
# Tail end of ''' string. r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
# Tail end of """ string. Floatnumber = group(Pointfloat, Expfloat)
double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""') Number = group(Imagnumber, Floatnumber, Intnumber)
# Single-line ' or " string.
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
r"//=?", r"->",
r"[+\-*@/%&|^=<>]=?",
r"~")
bracket = '[][(){}]'
special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
funny = group(operator, bracket, special)
# First (or only) line of ' or " string.
cont_str = group(r"[bBuU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
group("'", r'\\\r?\n'),
r'[bBuU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
group('"', r'\\\r?\n'))
pseudo_extras = group(r'\\\r?\n', comment, triple)
pseudo_token = group(whitespace) + \
group(pseudo_extras, number, funny, cont_str, name)
# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    """Return the set of all valid string-literal prefixes, including ''.

    Only the lower-case base prefixes are listed below; every ordering
    and upper/lower-case combination (e.g. 'br', 'Rb', 'BR', ...) is
    generated from them, mirroring CPython's own tokenize module.
    """
    # The valid string prefixes. Only contain the lower case versions,
    # and don't contain any permutations (include 'fr', but not
    # 'rf'). The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'br']
    if py_version >= 36:
        # f-strings (and raw f-strings) only exist since Python 3.6.
        _valid_string_prefixes += ['f', 'fr']
        # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result
def _compile(expr): def _compile(expr):
return re.compile(expr, re.UNICODE) return re.compile(expr, re.UNICODE)
# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())
pseudoprog, single3prog, double3prog = map( # Tail end of ' string.
_compile, (pseudo_token, single3, double3)) Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
endprogs = {"'": _compile(single), '"': _compile(double), # Because of leftmost-then-longest match semantics, be sure to put the
"'''": single3prog, '"""': double3prog, # longest operators first (e.g., if = came before ==, == would get
"r'''": single3prog, 'r"""': double3prog, # recognized as two instances of =).
"b'''": single3prog, 'b"""': double3prog, Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
"u'''": single3prog, 'u"""': double3prog, r"//=?", r"->",
"R'''": single3prog, 'R"""': double3prog, r"[+\-*/%&@|^=<>]=?",
"B'''": single3prog, 'B"""': double3prog, r"~")
"U'''": single3prog, 'U"""': double3prog,
"br'''": single3prog, 'br"""': double3prog,
"bR'''": single3prog, 'bR"""': double3prog,
"Br'''": single3prog, 'Br"""': double3prog,
"BR'''": single3prog, 'BR"""': double3prog,
"ur'''": single3prog, 'ur"""': double3prog,
"uR'''": single3prog, 'uR"""': double3prog,
"Ur'''": single3prog, 'Ur"""': double3prog,
"UR'''": single3prog, 'UR"""': double3prog,
'r': None, 'R': None, 'b': None, 'B': None}
triple_quoted = {} Bracket = '[][(){}]'
for t in ("'''", '"""', Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
"r'''", 'r"""', "R'''", 'R"""', Funny = group(Operator, Bracket, Special)
"b'''", 'b"""', "B'''", 'B"""',
"u'''", 'u"""', "U'''", 'U"""',
"br'''", 'br"""', "Br'''", 'Br"""',
"bR'''", 'bR"""', "BR'''", 'BR"""',
"ur'''", 'ur"""', "Ur'''", 'Ur"""',
"uR'''", 'uR"""', "UR'''", 'UR"""'):
triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
"r'", 'r"', "R'", 'R"',
"b'", 'b"', "B'", 'B"',
"u'", 'u"', "U'", 'U"',
"br'", 'br"', "Br'", 'Br"',
"bR'", 'bR"', "BR'", 'BR"',
"ur'", 'ur"', "Ur'", 'Ur"',
"uR'", 'uR"', "UR'", 'UR"'):
single_quoted[t] = t
del _compile PlainToken = group(Number, Funny, Name, capture=True)
# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
group("'", r'\\\r?\n'),
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = group(Whitespace, capture=True) + \
group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string. _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
endpats[_prefix + "'"] = _compile(Single)
endpats[_prefix + '"'] = _compile(Double)
endpats[_prefix + "'''"] = _compile(Single3)
endpats[_prefix + '"""'] = _compile(Double3)
# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
for u in (t + '"', t + "'"):
single_quoted.add(u)
for u in (t + '"""', t + "'''"):
triple_quoted.add(u)
tabsize = 8
# TODO add with? # TODO add with?
ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except', ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
'finally', 'while', 'return') 'finally', 'while', 'return')
pseudo_token_compiled = _compile(PseudoToken)
class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])): class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
@@ -228,7 +235,7 @@ def generate_tokens(readline, use_exact_op_types=False):
continue continue
while pos < max: while pos < max:
pseudomatch = pseudoprog.match(line, pos) pseudomatch = pseudo_token_compiled.match(line, pos)
if not pseudomatch: # scan for tokens if not pseudomatch: # scan for tokens
txt = line[pos] txt = line[pos]
if line[pos] in '"\'': if line[pos] in '"\'':
@@ -272,7 +279,7 @@ def generate_tokens(readline, use_exact_op_types=False):
assert not token.endswith("\n") assert not token.endswith("\n")
additional_prefix = prefix + token additional_prefix = prefix + token
elif token in triple_quoted: elif token in triple_quoted:
endprog = endprogs[token] endprog = endpats[token]
endmatch = endprog.match(line, pos) endmatch = endprog.match(line, pos)
if endmatch: # all on one line if endmatch: # all on one line
pos = endmatch.end(0) pos = endmatch.end(0)
@@ -288,8 +295,8 @@ def generate_tokens(readline, use_exact_op_types=False):
token[:3] in single_quoted: token[:3] in single_quoted:
if token[-1] == '\n': # continued string if token[-1] == '\n': # continued string
contstr_start = lnum, start contstr_start = lnum, start
endprog = (endprogs.get(initial) or endprogs.get(token[1]) endprog = (endpats.get(initial) or endpats.get(token[1])
or endprogs.get(token[2])) or endpats.get(token[2]))
contstr = line[start:] contstr = line[start:]
contline = line contline = line
break break

View File

@@ -5,7 +5,7 @@ from textwrap import dedent
import pytest import pytest
from jedi._compatibility import u, is_py3 from jedi._compatibility import u, is_py3, py_version
from jedi.parser.token import NAME, OP, NEWLINE, STRING, INDENT from jedi.parser.token import NAME, OP, NEWLINE, STRING, INDENT
from jedi.parser import ParserWithRecovery, load_grammar, tokenize from jedi.parser import ParserWithRecovery, load_grammar, tokenize
@@ -129,24 +129,32 @@ def test_ur_literals():
Decided to parse `u''` literals regardless of Python version. This makes Decided to parse `u''` literals regardless of Python version. This makes
probably sense: probably sense:
- Python 3.2 doesn't support it and is still supported by Jedi, but might - Python 3+ doesn't support it, but it doesn't hurt
not be. While this is incorrect, it's just incorrect for one "old" and in not be. While this is incorrect, it's just incorrect for one "old" and in
the future not very important version. the future not very important version.
- All the other Python versions work very well with it. - All the other Python versions work very well with it.
""" """
def check(literal): def check(literal, is_literal=True):
io = StringIO(u(literal)) io = StringIO(u(literal))
tokens = tokenize.generate_tokens(io.readline) tokens = tokenize.generate_tokens(io.readline)
token_list = list(tokens) token_list = list(tokens)
typ, result_literal, _, _ = token_list[0] typ, result_literal, _, _ = token_list[0]
assert typ == STRING if is_literal:
assert result_literal == literal assert typ == STRING
assert result_literal == literal
else:
assert typ == NAME
check('u""') check('u""')
check('ur""') check('ur""', is_literal=not is_py3)
check('Ur""') check('Ur""', is_literal=not is_py3)
check('UR""') check('UR""', is_literal=not is_py3)
check('bR""') check('bR""')
# Must be in the right order. # Starting with Python 3.3 this ordering is also possible, but we just
with pytest.raises(AssertionError): # enable it for all versions. It doesn't hurt.
check('Rb""') check('Rb""')
# Starting with Python 3.6 format strings were introduced.
check('fr""', is_literal=py_version >= 36)
check('rF""', is_literal=py_version >= 36)
check('f""', is_literal=py_version >= 36)
check('F""', is_literal=py_version >= 36)