1
0
forked from VimPlug/jedi

Update the tokenizer to include f-string literals and underscores in numeric literals. Tests are still needed.

This commit is contained in:
Dave Halter
2017-01-08 16:03:45 +01:00
parent 3f09f3a304
commit 00a9f1ec0a
3 changed files with 123 additions and 107 deletions

View File

@@ -12,11 +12,12 @@ try:
except ImportError: except ImportError:
pass pass
is_py3 = sys.version_info[0] >= 3 is_py3 = sys.version_info.major >= 3
is_py33 = is_py3 and sys.version_info.minor >= 3 is_py33 = is_py3 and sys.version_info.minor >= 3
is_py34 = is_py3 and sys.version_info.minor >= 4 is_py34 = is_py3 and sys.version_info.minor >= 4
is_py35 = is_py3 and sys.version_info.minor >= 5 is_py35 = is_py3 and sys.version_info.minor >= 5
is_py26 = not is_py3 and sys.version_info[1] < 7 is_py26 = not is_py3 and sys.version_info[1] < 7
py_version = int(str(sys.version_info.major) + str(sys.version_info.minor))
class DummyFile(object): class DummyFile(object):

View File

@@ -15,10 +15,11 @@ import string
import re import re
from collections import namedtuple from collections import namedtuple
from io import StringIO from io import StringIO
import itertools as _itertools
from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap, from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap,
NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT) NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
from jedi._compatibility import is_py3 from jedi._compatibility import is_py3, py_version
from jedi.common import splitlines from jedi.common import splitlines
@@ -37,121 +38,127 @@ COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT' tok_name[COMMENT] = 'COMMENT'
def group(*choices): def group(*choices, **kwargs):
return '(' + '|'.join(choices) + ')' capture = kwargs.pop('capture', False) # Python 2, arrghhhhh :(
assert not kwargs
start = '('
if not capture:
start += '?:'
return start + '|'.join(choices) + ')'
def any(*choices):
return group(*choices) + '*'
def maybe(*choices): def maybe(*choices):
return group(*choices) + '?' return group(*choices) + '?'
# Note: we use unicode matching for names ("\w") but ascii matching for # Note: we use unicode matching for names ("\w") but ascii matching for
# number literals. # number literals.
whitespace = r'[ \f\t]*' Whitespace = r'[ \f\t]*'
comment = r'#[^\r\n]*' Comment = r'#[^\r\n]*'
name = r'\w+' Name = r'\w+'
hex_number = r'0[xX][0-9a-fA-F]+' Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
bin_number = r'0[bB][01]+' Binnumber = r'0[bB](?:_?[01])+'
if is_py3: if is_py3:
oct_number = r'0[oO][0-7]+' Octnumber = r'0[oO](?:_?[0-7])+'
else: else:
oct_number = '0[0-7]+' Octnumber = '0[0-7]+'
dec_number = r'(?:0+|[1-9][0-9]*)'
int_number = group(hex_number, bin_number, oct_number, dec_number)
exponent = r'[eE][-+]?[0-9]+'
point_float = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(exponent)
Expfloat = r'[0-9]+' + exponent
float_number = group(point_float, Expfloat)
imag_number = group(r'[0-9]+[jJ]', float_number + r'[jJ]')
number = group(imag_number, float_number, int_number)
# Tail end of ' string. Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
single = r"[^'\\]*(?:\\.[^'\\]*)*'" Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
# Tail end of " string. Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
double = r'[^"\\]*(?:\\.[^"\\]*)*"' Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
# Tail end of ''' string. r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
# Tail end of """ string. Floatnumber = group(Pointfloat, Expfloat)
double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""') Number = group(Imagnumber, Floatnumber, Intnumber)
# Single-line ' or " string.
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
r"//=?", r"->",
r"[+\-*@/%&|^=<>]=?",
r"~")
bracket = '[][(){}]'
special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
funny = group(operator, bracket, special)
# First (or only) line of ' or " string.
cont_str = group(r"[bBuU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
group("'", r'\\\r?\n'),
r'[bBuU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
group('"', r'\\\r?\n'))
pseudo_extras = group(r'\\\r?\n', comment, triple)
pseudo_token = group(whitespace) + \
group(pseudo_extras, number, funny, cont_str, name)
# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    """Return the set of all valid string-literal prefixes, including ''.

    Only the lower-case base prefixes are listed below; every ordering
    and upper/lower-case combination (e.g. 'br', 'Rb', 'BR', ...) is
    generated from them, mirroring CPython's own tokenize module.
    """
    # The valid string prefixes. Only contain the lower case versions,
    # and don't contain any permutations (include 'fr', but not
    # 'rf'). The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'br']
    if py_version >= 36:
        # f-strings (and raw f-strings) only exist since Python 3.6.
        _valid_string_prefixes += ['f', 'fr']
        # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result
def _compile(expr): def _compile(expr):
return re.compile(expr, re.UNICODE) return re.compile(expr, re.UNICODE)
# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())
pseudoprog, single3prog, double3prog = map( # Tail end of ' string.
_compile, (pseudo_token, single3, double3)) Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
endprogs = {"'": _compile(single), '"': _compile(double), # Because of leftmost-then-longest match semantics, be sure to put the
"'''": single3prog, '"""': double3prog, # longest operators first (e.g., if = came before ==, == would get
"r'''": single3prog, 'r"""': double3prog, # recognized as two instances of =).
"b'''": single3prog, 'b"""': double3prog, Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
"u'''": single3prog, 'u"""': double3prog, r"//=?", r"->",
"R'''": single3prog, 'R"""': double3prog, r"[+\-*/%&@|^=<>]=?",
"B'''": single3prog, 'B"""': double3prog, r"~")
"U'''": single3prog, 'U"""': double3prog,
"br'''": single3prog, 'br"""': double3prog,
"bR'''": single3prog, 'bR"""': double3prog,
"Br'''": single3prog, 'Br"""': double3prog,
"BR'''": single3prog, 'BR"""': double3prog,
"ur'''": single3prog, 'ur"""': double3prog,
"uR'''": single3prog, 'uR"""': double3prog,
"Ur'''": single3prog, 'Ur"""': double3prog,
"UR'''": single3prog, 'UR"""': double3prog,
'r': None, 'R': None, 'b': None, 'B': None}
triple_quoted = {} Bracket = '[][(){}]'
for t in ("'''", '"""', Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
"r'''", 'r"""', "R'''", 'R"""', Funny = group(Operator, Bracket, Special)
"b'''", 'b"""', "B'''", 'B"""',
"u'''", 'u"""', "U'''", 'U"""',
"br'''", 'br"""', "Br'''", 'Br"""',
"bR'''", 'bR"""', "BR'''", 'BR"""',
"ur'''", 'ur"""', "Ur'''", 'Ur"""',
"uR'''", 'uR"""', "UR'''", 'UR"""'):
triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
"r'", 'r"', "R'", 'R"',
"b'", 'b"', "B'", 'B"',
"u'", 'u"', "U'", 'U"',
"br'", 'br"', "Br'", 'Br"',
"bR'", 'bR"', "BR'", 'BR"',
"ur'", 'ur"', "Ur'", 'Ur"',
"uR'", 'uR"', "UR'", 'UR"'):
single_quoted[t] = t
del _compile PlainToken = group(Number, Funny, Name, capture=True)
# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
group("'", r'\\\r?\n'),
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = group(Whitespace, capture=True) + \
group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string. _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
endpats[_prefix + "'"] = _compile(Single)
endpats[_prefix + '"'] = _compile(Double)
endpats[_prefix + "'''"] = _compile(Single3)
endpats[_prefix + '"""'] = _compile(Double3)
# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
for u in (t + '"', t + "'"):
single_quoted.add(u)
for u in (t + '"""', t + "'''"):
triple_quoted.add(u)
tabsize = 8
# TODO add with? # TODO add with?
ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except', ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
'finally', 'while', 'return') 'finally', 'while', 'return')
pseudo_token_compiled = _compile(PseudoToken)
class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])): class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
@@ -228,7 +235,7 @@ def generate_tokens(readline, use_exact_op_types=False):
continue continue
while pos < max: while pos < max:
pseudomatch = pseudoprog.match(line, pos) pseudomatch = pseudo_token_compiled.match(line, pos)
if not pseudomatch: # scan for tokens if not pseudomatch: # scan for tokens
txt = line[pos] txt = line[pos]
if line[pos] in '"\'': if line[pos] in '"\'':
@@ -272,7 +279,7 @@ def generate_tokens(readline, use_exact_op_types=False):
assert not token.endswith("\n") assert not token.endswith("\n")
additional_prefix = prefix + token additional_prefix = prefix + token
elif token in triple_quoted: elif token in triple_quoted:
endprog = endprogs[token] endprog = endpats[token]
endmatch = endprog.match(line, pos) endmatch = endprog.match(line, pos)
if endmatch: # all on one line if endmatch: # all on one line
pos = endmatch.end(0) pos = endmatch.end(0)
@@ -288,8 +295,8 @@ def generate_tokens(readline, use_exact_op_types=False):
token[:3] in single_quoted: token[:3] in single_quoted:
if token[-1] == '\n': # continued string if token[-1] == '\n': # continued string
contstr_start = lnum, start contstr_start = lnum, start
endprog = (endprogs.get(initial) or endprogs.get(token[1]) endprog = (endpats.get(initial) or endpats.get(token[1])
or endprogs.get(token[2])) or endpats.get(token[2]))
contstr = line[start:] contstr = line[start:]
contline = line contline = line
break break

View File

@@ -5,7 +5,7 @@ from textwrap import dedent
import pytest import pytest
from jedi._compatibility import u, is_py3 from jedi._compatibility import u, is_py3, py_version
from jedi.parser.token import NAME, OP, NEWLINE, STRING, INDENT from jedi.parser.token import NAME, OP, NEWLINE, STRING, INDENT
from jedi.parser import ParserWithRecovery, load_grammar, tokenize from jedi.parser import ParserWithRecovery, load_grammar, tokenize
@@ -129,24 +129,32 @@ def test_ur_literals():
Decided to parse `u''` literals regardless of Python version. This makes Decided to parse `u''` literals regardless of Python version. This makes
probably sense: probably sense:
- Python 3.2 doesn't support it and is still supported by Jedi, but might - Python 3+ doesn't support it, but it doesn't hurt
not be. While this is incorrect, it's just incorrect for one "old" and in not be. While this is incorrect, it's just incorrect for one "old" and in
the future not very important version. the future not very important version.
- All the other Python versions work very well with it. - All the other Python versions work very well with it.
""" """
def check(literal): def check(literal, is_literal=True):
io = StringIO(u(literal)) io = StringIO(u(literal))
tokens = tokenize.generate_tokens(io.readline) tokens = tokenize.generate_tokens(io.readline)
token_list = list(tokens) token_list = list(tokens)
typ, result_literal, _, _ = token_list[0] typ, result_literal, _, _ = token_list[0]
assert typ == STRING if is_literal:
assert result_literal == literal assert typ == STRING
assert result_literal == literal
else:
assert typ == NAME
check('u""') check('u""')
check('ur""') check('ur""', is_literal=not is_py3)
check('Ur""') check('Ur""', is_literal=not is_py3)
check('UR""') check('UR""', is_literal=not is_py3)
check('bR""') check('bR""')
# Must be in the right order. # Starting with Python 3.3 this ordering is also possible, but we just
with pytest.raises(AssertionError): # enable it for all versions. It doesn't hurt.
check('Rb""') check('Rb""')
# Starting with Python 3.6 format strings were introduced.
check('fr""', is_literal=py_version >= 36)
check('rF""', is_literal=py_version >= 36)
check('f""', is_literal=py_version >= 36)
check('F""', is_literal=py_version >= 36)