diff --git a/jedi/_compatibility.py b/jedi/_compatibility.py index 4710e253..4bfb1d7a 100644 --- a/jedi/_compatibility.py +++ b/jedi/_compatibility.py @@ -12,11 +12,12 @@ try: except ImportError: pass -is_py3 = sys.version_info[0] >= 3 +is_py3 = sys.version_info.major >= 3 is_py33 = is_py3 and sys.version_info.minor >= 3 is_py34 = is_py3 and sys.version_info.minor >= 4 is_py35 = is_py3 and sys.version_info.minor >= 5 is_py26 = not is_py3 and sys.version_info[1] < 7 +py_version = int(str(sys.version_info.major) + str(sys.version_info.minor)) class DummyFile(object): diff --git a/jedi/parser/tokenize.py b/jedi/parser/tokenize.py index 82e5d612..a2d789a9 100644 --- a/jedi/parser/tokenize.py +++ b/jedi/parser/tokenize.py @@ -15,10 +15,11 @@ import string import re from collections import namedtuple from io import StringIO +import itertools as _itertools from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap, NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT) -from jedi._compatibility import is_py3 +from jedi._compatibility import is_py3, py_version from jedi.common import splitlines @@ -37,121 +38,127 @@ COMMENT = N_TOKENS tok_name[COMMENT] = 'COMMENT' -def group(*choices): - return '(' + '|'.join(choices) + ')' +def group(*choices, **kwargs): + capture = kwargs.pop('capture', False) # Python 2, arrghhhhh :( + assert not kwargs + start = '(' + if not capture: + start += '?:' + return start + '|'.join(choices) + ')' + +def any(*choices): + return group(*choices) + '*' def maybe(*choices): return group(*choices) + '?' - # Note: we use unicode matching for names ("\w") but ascii matching for # number literals. 
-whitespace = r'[ \f\t]*' -comment = r'#[^\r\n]*' -name = r'\w+' +Whitespace = r'[ \f\t]*' +Comment = r'#[^\r\n]*' +Name = r'\w+' -hex_number = r'0[xX][0-9a-fA-F]+' -bin_number = r'0[bB][01]+' +Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+' +Binnumber = r'0[bB](?:_?[01])+' if is_py3: - oct_number = r'0[oO][0-7]+' + Octnumber = r'0[oO](?:_?[0-7])+' else: - oct_number = '0[0-7]+' -dec_number = r'(?:0+|[1-9][0-9]*)' -int_number = group(hex_number, bin_number, oct_number, dec_number) -exponent = r'[eE][-+]?[0-9]+' -point_float = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(exponent) -Expfloat = r'[0-9]+' + exponent -float_number = group(point_float, Expfloat) -imag_number = group(r'[0-9]+[jJ]', float_number + r'[jJ]') -number = group(imag_number, float_number, int_number) + Octnumber = '0[0-7]+' -# Tail end of ' string. -single = r"[^'\\]*(?:\\.[^'\\]*)*'" -# Tail end of " string. -double = r'[^"\\]*(?:\\.[^"\\]*)*"' -# Tail end of ''' string. -single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" -# Tail end of """ string. -double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' -triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""') -# Single-line ' or " string. - -# Because of leftmost-then-longest match semantics, be sure to put the -# longest operators first (e.g., if = came before ==, == would get -# recognized as two instances of =). -operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=", - r"//=?", r"->", - r"[+\-*@/%&|^=<>]=?", - r"~") - -bracket = '[][(){}]' -special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]') -funny = group(operator, bracket, special) - -# First (or only) line of ' or " string. 
-cont_str = group(r"[bBuU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + - group("'", r'\\\r?\n'), - r'[bBuU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' + - group('"', r'\\\r?\n')) -pseudo_extras = group(r'\\\r?\n', comment, triple) -pseudo_token = group(whitespace) + \ - group(pseudo_extras, number, funny, cont_str, name) +Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)' +Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) +Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*' +Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?', + r'\.[0-9](?:_?[0-9])*') + maybe(Exponent) +Expfloat = r'[0-9](?:_?[0-9])*' + Exponent +Floatnumber = group(Pointfloat, Expfloat) +Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]') +Number = group(Imagnumber, Floatnumber, Intnumber) +# Return the empty string, plus all of the valid string prefixes. +def _all_string_prefixes(): + # The valid string prefixes. Only contain the lower case versions, + # and don't contain any permutations (include 'fr', but not + # 'rf'). The various permutations will be generated. + _valid_string_prefixes = ['b', 'r', 'u', 'br'] + if py_version >= 36: + _valid_string_prefixes += ['f', 'fr'] + # if we add binary f-strings, add: ['fb', 'fbr'] + result = set(['']) + for prefix in _valid_string_prefixes: + for t in _itertools.permutations(prefix): + # create a list with upper and lower versions of each + # character + for u in _itertools.product(*[(c, c.upper()) for c in t]): + result.add(''.join(u)) + return result def _compile(expr): return re.compile(expr, re.UNICODE) +# Note that since _all_string_prefixes includes the empty string, +# StringPrefix can be the empty string (making it optional). +StringPrefix = group(*_all_string_prefixes()) -pseudoprog, single3prog, double3prog = map( - _compile, (pseudo_token, single3, double3)) +# Tail end of ' string. +Single = r"[^'\\]*(?:\\.[^'\\]*)*'" +# Tail end of " string. +Double = r'[^"\\]*(?:\\.[^"\\]*)*"' +# Tail end of ''' string. 
+Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" +# Tail end of """ string. +Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' +Triple = group(StringPrefix + "'''", StringPrefix + '"""') -endprogs = {"'": _compile(single), '"': _compile(double), - "'''": single3prog, '"""': double3prog, - "r'''": single3prog, 'r"""': double3prog, - "b'''": single3prog, 'b"""': double3prog, - "u'''": single3prog, 'u"""': double3prog, - "R'''": single3prog, 'R"""': double3prog, - "B'''": single3prog, 'B"""': double3prog, - "U'''": single3prog, 'U"""': double3prog, - "br'''": single3prog, 'br"""': double3prog, - "bR'''": single3prog, 'bR"""': double3prog, - "Br'''": single3prog, 'Br"""': double3prog, - "BR'''": single3prog, 'BR"""': double3prog, - "ur'''": single3prog, 'ur"""': double3prog, - "uR'''": single3prog, 'uR"""': double3prog, - "Ur'''": single3prog, 'Ur"""': double3prog, - "UR'''": single3prog, 'UR"""': double3prog, - 'r': None, 'R': None, 'b': None, 'B': None} +# Because of leftmost-then-longest match semantics, be sure to put the +# longest operators first (e.g., if = came before ==, == would get +# recognized as two instances of =). 
+Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=", + r"//=?", r"->", + r"[+\-*/%&@|^=<>]=?", + r"~") -triple_quoted = {} -for t in ("'''", '"""', - "r'''", 'r"""', "R'''", 'R"""', - "b'''", 'b"""', "B'''", 'B"""', - "u'''", 'u"""', "U'''", 'U"""', - "br'''", 'br"""', "Br'''", 'Br"""', - "bR'''", 'bR"""', "BR'''", 'BR"""', - "ur'''", 'ur"""', "Ur'''", 'Ur"""', - "uR'''", 'uR"""', "UR'''", 'UR"""'): - triple_quoted[t] = t -single_quoted = {} -for t in ("'", '"', - "r'", 'r"', "R'", 'R"', - "b'", 'b"', "B'", 'B"', - "u'", 'u"', "U'", 'U"', - "br'", 'br"', "Br'", 'Br"', - "bR'", 'bR"', "BR'", 'BR"', - "ur'", 'ur"', "Ur'", 'Ur"', - "uR'", 'uR"', "UR'", 'UR"'): - single_quoted[t] = t +Bracket = '[][(){}]' +Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]') +Funny = group(Operator, Bracket, Special) -del _compile +PlainToken = group(Number, Funny, Name, capture=True) + +# First (or only) line of ' or " string. +ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + + group("'", r'\\\r?\n'), + StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + + group('"', r'\\\r?\n')) +PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple) +PseudoToken = group(Whitespace, capture=True) + \ + group(PseudoExtras, Number, Funny, ContStr, Name, capture=True) + +# For a given string prefix plus quotes, endpats maps it to a regex +# to match the remainder of that string. _prefix can be empty, for +# a normal single or triple quoted string (with no prefix). +endpats = {} +for _prefix in _all_string_prefixes(): + endpats[_prefix + "'"] = _compile(Single) + endpats[_prefix + '"'] = _compile(Double) + endpats[_prefix + "'''"] = _compile(Single3) + endpats[_prefix + '"""'] = _compile(Double3) + +# A set of all of the single and triple quoted string prefixes, +# including the opening quotes. 
+single_quoted = set() +triple_quoted = set() +for t in _all_string_prefixes(): + for u in (t + '"', t + "'"): + single_quoted.add(u) + for u in (t + '"""', t + "'''"): + triple_quoted.add(u) -tabsize = 8 # TODO add with? ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except', 'finally', 'while', 'return') +pseudo_token_compiled = _compile(PseudoToken) class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])): @@ -228,7 +235,7 @@ def generate_tokens(readline, use_exact_op_types=False): continue while pos < max: - pseudomatch = pseudoprog.match(line, pos) + pseudomatch = pseudo_token_compiled.match(line, pos) if not pseudomatch: # scan for tokens txt = line[pos] if line[pos] in '"\'': @@ -272,7 +279,7 @@ def generate_tokens(readline, use_exact_op_types=False): assert not token.endswith("\n") additional_prefix = prefix + token elif token in triple_quoted: - endprog = endprogs[token] + endprog = endpats[token] endmatch = endprog.match(line, pos) if endmatch: # all on one line pos = endmatch.end(0) @@ -288,8 +295,8 @@ def generate_tokens(readline, use_exact_op_types=False): token[:3] in single_quoted: if token[-1] == '\n': # continued string contstr_start = lnum, start - endprog = (endprogs.get(initial) or endprogs.get(token[1]) - or endprogs.get(token[2])) + endprog = (endpats.get(initial) or endpats.get(token[1]) + or endpats.get(token[2])) contstr = line[start:] contline = line break diff --git a/test/test_parser/test_tokenize.py b/test/test_parser/test_tokenize.py index a42271ff..1cbc4f3b 100644 --- a/test/test_parser/test_tokenize.py +++ b/test/test_parser/test_tokenize.py @@ -5,7 +5,7 @@ from textwrap import dedent import pytest -from jedi._compatibility import u, is_py3 +from jedi._compatibility import u, is_py3, py_version from jedi.parser.token import NAME, OP, NEWLINE, STRING, INDENT from jedi.parser import ParserWithRecovery, load_grammar, tokenize @@ -129,24 +129,32 @@ def test_ur_literals(): Decided to parse `u''` 
literals regardless of Python version. This makes probably sense: - - Python 3.2 doesn't support it and is still supported by Jedi, but might + - Python 3+ doesn't support it, but it doesn't hurt not be. While this is incorrect, it's just incorrect for one "old" and in the future not very important version. - All the other Python versions work very well with it. """ - def check(literal): + def check(literal, is_literal=True): io = StringIO(u(literal)) tokens = tokenize.generate_tokens(io.readline) token_list = list(tokens) typ, result_literal, _, _ = token_list[0] - assert typ == STRING - assert result_literal == literal + if is_literal: + assert typ == STRING + assert result_literal == literal + else: + assert typ == NAME check('u""') - check('ur""') - check('Ur""') - check('UR""') + check('ur""', is_literal=not is_py3) + check('Ur""', is_literal=not is_py3) + check('UR""', is_literal=not is_py3) check('bR""') - # Must be in the right order. - with pytest.raises(AssertionError): - check('Rb""') + # Starting with Python 3.3 this ordering is also possible, but we just + # enable it for all versions. It doesn't hurt. + check('Rb""') + # Starting with Python 3.6 format strings were introduced. + check('fr""', is_literal=py_version >= 36) + check('rF""', is_literal=py_version >= 36) + check('f""', is_literal=py_version >= 36) + check('F""', is_literal=py_version >= 36)