forked from VimPlug/jedi
Update the tokenizer to support f-string literals and underscores in numeric literals. Tests are still needed.
This commit is contained in:
@@ -12,11 +12,12 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
is_py3 = sys.version_info[0] >= 3
|
is_py3 = sys.version_info.major >= 3
|
||||||
is_py33 = is_py3 and sys.version_info.minor >= 3
|
is_py33 = is_py3 and sys.version_info.minor >= 3
|
||||||
is_py34 = is_py3 and sys.version_info.minor >= 4
|
is_py34 = is_py3 and sys.version_info.minor >= 4
|
||||||
is_py35 = is_py3 and sys.version_info.minor >= 5
|
is_py35 = is_py3 and sys.version_info.minor >= 5
|
||||||
is_py26 = not is_py3 and sys.version_info[1] < 7
|
is_py26 = not is_py3 and sys.version_info[1] < 7
|
||||||
|
py_version = int(str(sys.version_info.major) + str(sys.version_info.minor))
|
||||||
|
|
||||||
|
|
||||||
class DummyFile(object):
|
class DummyFile(object):
|
||||||
|
|||||||
@@ -15,10 +15,11 @@ import string
|
|||||||
import re
|
import re
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
|
import itertools as _itertools
|
||||||
|
|
||||||
from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap,
|
from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap,
|
||||||
NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
|
NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
|
||||||
from jedi._compatibility import is_py3
|
from jedi._compatibility import is_py3, py_version
|
||||||
from jedi.common import splitlines
|
from jedi.common import splitlines
|
||||||
|
|
||||||
|
|
||||||
@@ -37,121 +38,127 @@ COMMENT = N_TOKENS
|
|||||||
tok_name[COMMENT] = 'COMMENT'
|
tok_name[COMMENT] = 'COMMENT'
|
||||||
|
|
||||||
|
|
||||||
def group(*choices):
|
def group(*choices, **kwargs):
|
||||||
return '(' + '|'.join(choices) + ')'
|
capture = kwargs.pop('capture', False) # Python 2, arrghhhhh :(
|
||||||
|
assert not kwargs
|
||||||
|
|
||||||
|
start = '('
|
||||||
|
if not capture:
|
||||||
|
start += '?:'
|
||||||
|
return start + '|'.join(choices) + ')'
|
||||||
|
|
||||||
|
def any(*choices):
|
||||||
|
return group(*choices) + '*'
|
||||||
|
|
||||||
def maybe(*choices):
|
def maybe(*choices):
|
||||||
return group(*choices) + '?'
|
return group(*choices) + '?'
|
||||||
|
|
||||||
|
|
||||||
# Note: we use unicode matching for names ("\w") but ascii matching for
|
# Note: we use unicode matching for names ("\w") but ascii matching for
|
||||||
# number literals.
|
# number literals.
|
||||||
whitespace = r'[ \f\t]*'
|
Whitespace = r'[ \f\t]*'
|
||||||
comment = r'#[^\r\n]*'
|
Comment = r'#[^\r\n]*'
|
||||||
name = r'\w+'
|
Name = r'\w+'
|
||||||
|
|
||||||
hex_number = r'0[xX][0-9a-fA-F]+'
|
Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
|
||||||
bin_number = r'0[bB][01]+'
|
Binnumber = r'0[bB](?:_?[01])+'
|
||||||
if is_py3:
|
if is_py3:
|
||||||
oct_number = r'0[oO][0-7]+'
|
Octnumber = r'0[oO](?:_?[0-7])+'
|
||||||
else:
|
else:
|
||||||
oct_number = '0[0-7]+'
|
Octnumber = '0[0-7]+'
|
||||||
dec_number = r'(?:0+|[1-9][0-9]*)'
|
|
||||||
int_number = group(hex_number, bin_number, oct_number, dec_number)
|
|
||||||
exponent = r'[eE][-+]?[0-9]+'
|
|
||||||
point_float = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(exponent)
|
|
||||||
Expfloat = r'[0-9]+' + exponent
|
|
||||||
float_number = group(point_float, Expfloat)
|
|
||||||
imag_number = group(r'[0-9]+[jJ]', float_number + r'[jJ]')
|
|
||||||
number = group(imag_number, float_number, int_number)
|
|
||||||
|
|
||||||
# Tail end of ' string.
|
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
|
||||||
single = r"[^'\\]*(?:\\.[^'\\]*)*'"
|
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
|
||||||
# Tail end of " string.
|
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
|
||||||
double = r'[^"\\]*(?:\\.[^"\\]*)*"'
|
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
|
||||||
# Tail end of ''' string.
|
r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
|
||||||
single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
|
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
|
||||||
# Tail end of """ string.
|
Floatnumber = group(Pointfloat, Expfloat)
|
||||||
double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
|
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
|
||||||
triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
|
Number = group(Imagnumber, Floatnumber, Intnumber)
|
||||||
# Single-line ' or " string.
|
|
||||||
|
|
||||||
# Because of leftmost-then-longest match semantics, be sure to put the
|
|
||||||
# longest operators first (e.g., if = came before ==, == would get
|
|
||||||
# recognized as two instances of =).
|
|
||||||
operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
|
|
||||||
r"//=?", r"->",
|
|
||||||
r"[+\-*@/%&|^=<>]=?",
|
|
||||||
r"~")
|
|
||||||
|
|
||||||
bracket = '[][(){}]'
|
|
||||||
special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
|
|
||||||
funny = group(operator, bracket, special)
|
|
||||||
|
|
||||||
# First (or only) line of ' or " string.
|
|
||||||
cont_str = group(r"[bBuU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
|
|
||||||
group("'", r'\\\r?\n'),
|
|
||||||
r'[bBuU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
|
|
||||||
group('"', r'\\\r?\n'))
|
|
||||||
pseudo_extras = group(r'\\\r?\n', comment, triple)
|
|
||||||
pseudo_token = group(whitespace) + \
|
|
||||||
group(pseudo_extras, number, funny, cont_str, name)
|
|
||||||
|
|
||||||
|
# Return the empty string, plus all of the valid string prefixes.
|
||||||
|
def _all_string_prefixes():
|
||||||
|
# The valid string prefixes. Only contain the lower case versions,
|
||||||
|
# and don't contain any permutations (include 'fr', but not
|
||||||
|
# 'rf'). The various permutations will be generated.
|
||||||
|
_valid_string_prefixes = ['b', 'r', 'u', 'br']
|
||||||
|
if py_version >= 36:
|
||||||
|
_valid_string_prefixes += ['f', 'fr']
|
||||||
|
# if we add binary f-strings, add: ['fb', 'fbr']
|
||||||
|
result = set([''])
|
||||||
|
for prefix in _valid_string_prefixes:
|
||||||
|
for t in _itertools.permutations(prefix):
|
||||||
|
# create a list with upper and lower versions of each
|
||||||
|
# character
|
||||||
|
for u in _itertools.product(*[(c, c.upper()) for c in t]):
|
||||||
|
result.add(''.join(u))
|
||||||
|
return result
|
||||||
|
|
||||||
def _compile(expr):
|
def _compile(expr):
|
||||||
return re.compile(expr, re.UNICODE)
|
return re.compile(expr, re.UNICODE)
|
||||||
|
|
||||||
|
# Note that since _all_string_prefixes includes the empty string,
|
||||||
|
# StringPrefix can be the empty string (making it optional).
|
||||||
|
StringPrefix = group(*_all_string_prefixes())
|
||||||
|
|
||||||
pseudoprog, single3prog, double3prog = map(
|
# Tail end of ' string.
|
||||||
_compile, (pseudo_token, single3, double3))
|
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
|
||||||
|
# Tail end of " string.
|
||||||
|
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
|
||||||
|
# Tail end of ''' string.
|
||||||
|
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
|
||||||
|
# Tail end of """ string.
|
||||||
|
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
|
||||||
|
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
|
||||||
|
|
||||||
endprogs = {"'": _compile(single), '"': _compile(double),
|
# Because of leftmost-then-longest match semantics, be sure to put the
|
||||||
"'''": single3prog, '"""': double3prog,
|
# longest operators first (e.g., if = came before ==, == would get
|
||||||
"r'''": single3prog, 'r"""': double3prog,
|
# recognized as two instances of =).
|
||||||
"b'''": single3prog, 'b"""': double3prog,
|
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
|
||||||
"u'''": single3prog, 'u"""': double3prog,
|
r"//=?", r"->",
|
||||||
"R'''": single3prog, 'R"""': double3prog,
|
r"[+\-*/%&@|^=<>]=?",
|
||||||
"B'''": single3prog, 'B"""': double3prog,
|
r"~")
|
||||||
"U'''": single3prog, 'U"""': double3prog,
|
|
||||||
"br'''": single3prog, 'br"""': double3prog,
|
|
||||||
"bR'''": single3prog, 'bR"""': double3prog,
|
|
||||||
"Br'''": single3prog, 'Br"""': double3prog,
|
|
||||||
"BR'''": single3prog, 'BR"""': double3prog,
|
|
||||||
"ur'''": single3prog, 'ur"""': double3prog,
|
|
||||||
"uR'''": single3prog, 'uR"""': double3prog,
|
|
||||||
"Ur'''": single3prog, 'Ur"""': double3prog,
|
|
||||||
"UR'''": single3prog, 'UR"""': double3prog,
|
|
||||||
'r': None, 'R': None, 'b': None, 'B': None}
|
|
||||||
|
|
||||||
triple_quoted = {}
|
Bracket = '[][(){}]'
|
||||||
for t in ("'''", '"""',
|
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
|
||||||
"r'''", 'r"""', "R'''", 'R"""',
|
Funny = group(Operator, Bracket, Special)
|
||||||
"b'''", 'b"""', "B'''", 'B"""',
|
|
||||||
"u'''", 'u"""', "U'''", 'U"""',
|
|
||||||
"br'''", 'br"""', "Br'''", 'Br"""',
|
|
||||||
"bR'''", 'bR"""', "BR'''", 'BR"""',
|
|
||||||
"ur'''", 'ur"""', "Ur'''", 'Ur"""',
|
|
||||||
"uR'''", 'uR"""', "UR'''", 'UR"""'):
|
|
||||||
triple_quoted[t] = t
|
|
||||||
single_quoted = {}
|
|
||||||
for t in ("'", '"',
|
|
||||||
"r'", 'r"', "R'", 'R"',
|
|
||||||
"b'", 'b"', "B'", 'B"',
|
|
||||||
"u'", 'u"', "U'", 'U"',
|
|
||||||
"br'", 'br"', "Br'", 'Br"',
|
|
||||||
"bR'", 'bR"', "BR'", 'BR"',
|
|
||||||
"ur'", 'ur"', "Ur'", 'Ur"',
|
|
||||||
"uR'", 'uR"', "UR'", 'UR"'):
|
|
||||||
single_quoted[t] = t
|
|
||||||
|
|
||||||
del _compile
|
PlainToken = group(Number, Funny, Name, capture=True)
|
||||||
|
|
||||||
|
# First (or only) line of ' or " string.
|
||||||
|
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
|
||||||
|
group("'", r'\\\r?\n'),
|
||||||
|
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
|
||||||
|
group('"', r'\\\r?\n'))
|
||||||
|
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
|
||||||
|
PseudoToken = group(Whitespace, capture=True) + \
|
||||||
|
group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
|
||||||
|
|
||||||
|
# For a given string prefix plus quotes, endpats maps it to a regex
|
||||||
|
# to match the remainder of that string. _prefix can be empty, for
|
||||||
|
# a normal single or triple quoted string (with no prefix).
|
||||||
|
endpats = {}
|
||||||
|
for _prefix in _all_string_prefixes():
|
||||||
|
endpats[_prefix + "'"] = _compile(Single)
|
||||||
|
endpats[_prefix + '"'] = _compile(Double)
|
||||||
|
endpats[_prefix + "'''"] = _compile(Single3)
|
||||||
|
endpats[_prefix + '"""'] = _compile(Double3)
|
||||||
|
|
||||||
|
# A set of all of the single and triple quoted string prefixes,
|
||||||
|
# including the opening quotes.
|
||||||
|
single_quoted = set()
|
||||||
|
triple_quoted = set()
|
||||||
|
for t in _all_string_prefixes():
|
||||||
|
for u in (t + '"', t + "'"):
|
||||||
|
single_quoted.add(u)
|
||||||
|
for u in (t + '"""', t + "'''"):
|
||||||
|
triple_quoted.add(u)
|
||||||
|
|
||||||
tabsize = 8
|
|
||||||
|
|
||||||
# TODO add with?
|
# TODO add with?
|
||||||
ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
|
ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
|
||||||
'finally', 'while', 'return')
|
'finally', 'while', 'return')
|
||||||
|
pseudo_token_compiled = _compile(PseudoToken)
|
||||||
|
|
||||||
|
|
||||||
class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
|
class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
|
||||||
@@ -228,7 +235,7 @@ def generate_tokens(readline, use_exact_op_types=False):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
while pos < max:
|
while pos < max:
|
||||||
pseudomatch = pseudoprog.match(line, pos)
|
pseudomatch = pseudo_token_compiled.match(line, pos)
|
||||||
if not pseudomatch: # scan for tokens
|
if not pseudomatch: # scan for tokens
|
||||||
txt = line[pos]
|
txt = line[pos]
|
||||||
if line[pos] in '"\'':
|
if line[pos] in '"\'':
|
||||||
@@ -272,7 +279,7 @@ def generate_tokens(readline, use_exact_op_types=False):
|
|||||||
assert not token.endswith("\n")
|
assert not token.endswith("\n")
|
||||||
additional_prefix = prefix + token
|
additional_prefix = prefix + token
|
||||||
elif token in triple_quoted:
|
elif token in triple_quoted:
|
||||||
endprog = endprogs[token]
|
endprog = endpats[token]
|
||||||
endmatch = endprog.match(line, pos)
|
endmatch = endprog.match(line, pos)
|
||||||
if endmatch: # all on one line
|
if endmatch: # all on one line
|
||||||
pos = endmatch.end(0)
|
pos = endmatch.end(0)
|
||||||
@@ -288,8 +295,8 @@ def generate_tokens(readline, use_exact_op_types=False):
|
|||||||
token[:3] in single_quoted:
|
token[:3] in single_quoted:
|
||||||
if token[-1] == '\n': # continued string
|
if token[-1] == '\n': # continued string
|
||||||
contstr_start = lnum, start
|
contstr_start = lnum, start
|
||||||
endprog = (endprogs.get(initial) or endprogs.get(token[1])
|
endprog = (endpats.get(initial) or endpats.get(token[1])
|
||||||
or endprogs.get(token[2]))
|
or endpats.get(token[2]))
|
||||||
contstr = line[start:]
|
contstr = line[start:]
|
||||||
contline = line
|
contline = line
|
||||||
break
|
break
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ from textwrap import dedent
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from jedi._compatibility import u, is_py3
|
from jedi._compatibility import u, is_py3, py_version
|
||||||
from jedi.parser.token import NAME, OP, NEWLINE, STRING, INDENT
|
from jedi.parser.token import NAME, OP, NEWLINE, STRING, INDENT
|
||||||
from jedi.parser import ParserWithRecovery, load_grammar, tokenize
|
from jedi.parser import ParserWithRecovery, load_grammar, tokenize
|
||||||
|
|
||||||
@@ -129,24 +129,32 @@ def test_ur_literals():
|
|||||||
Decided to parse `u''` literals regardless of Python version. This makes
|
Decided to parse `u''` literals regardless of Python version. This makes
|
||||||
probably sense:
|
probably sense:
|
||||||
|
|
||||||
- Python 3.2 doesn't support it and is still supported by Jedi, but might
|
- Python 3+ doesn't support it, but it doesn't hurt
|
||||||
not be. While this is incorrect, it's just incorrect for one "old" and in
|
to accept it. While this is incorrect, it's just incorrect for one "old" and in
|
||||||
the future not very important version.
|
the future not very important version.
|
||||||
- All the other Python versions work very well with it.
|
- All the other Python versions work very well with it.
|
||||||
"""
|
"""
|
||||||
def check(literal):
|
def check(literal, is_literal=True):
|
||||||
io = StringIO(u(literal))
|
io = StringIO(u(literal))
|
||||||
tokens = tokenize.generate_tokens(io.readline)
|
tokens = tokenize.generate_tokens(io.readline)
|
||||||
token_list = list(tokens)
|
token_list = list(tokens)
|
||||||
typ, result_literal, _, _ = token_list[0]
|
typ, result_literal, _, _ = token_list[0]
|
||||||
assert typ == STRING
|
if is_literal:
|
||||||
assert result_literal == literal
|
assert typ == STRING
|
||||||
|
assert result_literal == literal
|
||||||
|
else:
|
||||||
|
assert typ == NAME
|
||||||
|
|
||||||
check('u""')
|
check('u""')
|
||||||
check('ur""')
|
check('ur""', is_literal=not is_py3)
|
||||||
check('Ur""')
|
check('Ur""', is_literal=not is_py3)
|
||||||
check('UR""')
|
check('UR""', is_literal=not is_py3)
|
||||||
check('bR""')
|
check('bR""')
|
||||||
# Must be in the right order.
|
# Starting with Python 3.3 this ordering is also possible, but we just
|
||||||
with pytest.raises(AssertionError):
|
# enable it for all versions. It doesn't hurt.
|
||||||
check('Rb""')
|
check('Rb""')
|
||||||
|
# Starting with Python 3.6, format strings were introduced.
|
||||||
|
check('fr""', is_literal=py_version >= 36)
|
||||||
|
check('rF""', is_literal=py_version >= 36)
|
||||||
|
check('f""', is_literal=py_version >= 36)
|
||||||
|
check('F""', is_literal=py_version >= 36)
|
||||||
|
|||||||
Reference in New Issue
Block a user