Update the tokenizer to include f literals and underscores. Need tests still.

2017-01-08 16:03:45 +01:00
parent 3f09f3a304
commit 00a9f1ec0a
3 changed files with 123 additions and 107 deletions
--- a/jedi/_compatibility.py
+++ b/jedi/_compatibility.py
@@ -12,11 +12,12 @@ try:
 except ImportError:
    pass

-is_py3 = sys.version_info[0] >= 3
+is_py3 = sys.version_info.major >= 3
 is_py33 = is_py3 and sys.version_info.minor >= 3
 is_py34 = is_py3 and sys.version_info.minor >= 4
 is_py35 = is_py3 and sys.version_info.minor >= 5
 is_py26 = not is_py3 and sys.version_info[1] < 7
+py_version = int(str(sys.version_info.major) + str(sys.version_info.minor))


 class DummyFile(object):
--- a/jedi/parser/tokenize.py
+++ b/jedi/parser/tokenize.py
@@ -15,10 +15,11 @@ import string
 import re
 from collections import namedtuple
 from io import StringIO
+import itertools as _itertools

 from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap,
                               NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
-from jedi._compatibility import is_py3
+from jedi._compatibility import is_py3, py_version
 from jedi.common import splitlines


@@ -37,121 +38,127 @@ COMMENT = N_TOKENS
 tok_name[COMMENT] = 'COMMENT'


-def group(*choices):
-    return '(' + '|'.join(choices) + ')'
+def group(*choices, **kwargs):
+    capture = kwargs.pop('capture', False)  # Python 2, arrghhhhh :(
+    assert not kwargs

+    start = '('
+    if not capture:
+        start += '?:'
+    return start + '|'.join(choices) + ')'
+
+def any(*choices):
+    return group(*choices) + '*'

 def maybe(*choices):
    return group(*choices) + '?'

-
 # Note: we use unicode matching for names ("\w") but ascii matching for
 # number literals.
-whitespace = r'[ \f\t]*'
-comment = r'#[^\r\n]*'
-name = r'\w+'
+Whitespace = r'[ \f\t]*'
+Comment = r'#[^\r\n]*'
+Name = r'\w+'

-hex_number = r'0[xX][0-9a-fA-F]+'
-bin_number = r'0[bB][01]+'
+Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
+Binnumber = r'0[bB](?:_?[01])+'
 if is_py3:
-    oct_number = r'0[oO][0-7]+'
+    Octnumber = r'0[oO](?:_?[0-7])+'
 else:
-    oct_number = '0[0-7]+'
-dec_number = r'(?:0+|[1-9][0-9]*)'
-int_number = group(hex_number, bin_number, oct_number, dec_number)
-exponent = r'[eE][-+]?[0-9]+'
-point_float = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(exponent)
-Expfloat = r'[0-9]+' + exponent
-float_number = group(point_float, Expfloat)
-imag_number = group(r'[0-9]+[jJ]', float_number + r'[jJ]')
-number = group(imag_number, float_number, int_number)
+    Octnumber = '0[0-7]+'

-# Tail end of ' string.
-single = r"[^'\\]*(?:\\.[^'\\]*)*'"
-# Tail end of " string.
-double = r'[^"\\]*(?:\\.[^"\\]*)*"'
-# Tail end of ''' string.
-single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
-# Tail end of """ string.
-double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
-# Single-line ' or " string.
-
-# Because of leftmost-then-longest match semantics, be sure to put the
-# longest operators first (e.g., if = came before ==, == would get
-# recognized as two instances of =).
-operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
-                 r"//=?", r"->",
-                 r"[+\-*@/%&|^=<>]=?",
-                 r"~")
-
-bracket = '[][(){}]'
-special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
-funny = group(operator, bracket, special)
-
-# First (or only) line of ' or " string.
-cont_str = group(r"[bBuU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
-                 group("'", r'\\\r?\n'),
-                 r'[bBuU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
-                 group('"', r'\\\r?\n'))
-pseudo_extras = group(r'\\\r?\n', comment, triple)
-pseudo_token = group(whitespace) + \
-    group(pseudo_extras, number, funny, cont_str, name)
+Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
+Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
+Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
+Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
+                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
+Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
+Floatnumber = group(Pointfloat, Expfloat)
+Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
+Number = group(Imagnumber, Floatnumber, Intnumber)

+# Return the empty string, plus all of the valid string prefixes.
+def _all_string_prefixes():
+    # The valid string prefixes. Only contain the lower case versions,
+    #  and don't contain any permuations (include 'fr', but not
+    #  'rf'). The various permutations will be generated.
+    _valid_string_prefixes = ['b', 'r', 'u', 'br']
+    if py_version >= 36:
+        _valid_string_prefixes += ['f', 'fr']
+    # if we add binary f-strings, add: ['fb', 'fbr']
+    result = set([''])
+    for prefix in _valid_string_prefixes:
+        for t in _itertools.permutations(prefix):
+            # create a list with upper and lower versions of each
+            #  character
+            for u in _itertools.product(*[(c, c.upper()) for c in t]):
+                result.add(''.join(u))
+    return result

 def _compile(expr):
    return re.compile(expr, re.UNICODE)

+# Note that since _all_string_prefixes includes the empty string,
+#  StringPrefix can be the empty string (making it optional).
+StringPrefix = group(*_all_string_prefixes())

-pseudoprog, single3prog, double3prog = map(
-    _compile, (pseudo_token, single3, double3))
+# Tail end of ' string.
+Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
+# Tail end of " string.
+Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
+# Tail end of ''' string.
+Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
+# Tail end of """ string.
+Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
+Triple = group(StringPrefix + "'''", StringPrefix + '"""')

-endprogs = {"'": _compile(single), '"': _compile(double),
-            "'''": single3prog, '"""': double3prog,
-            "r'''": single3prog, 'r"""': double3prog,
-            "b'''": single3prog, 'b"""': double3prog,
-            "u'''": single3prog, 'u"""': double3prog,
-            "R'''": single3prog, 'R"""': double3prog,
-            "B'''": single3prog, 'B"""': double3prog,
-            "U'''": single3prog, 'U"""': double3prog,
-            "br'''": single3prog, 'br"""': double3prog,
-            "bR'''": single3prog, 'bR"""': double3prog,
-            "Br'''": single3prog, 'Br"""': double3prog,
-            "BR'''": single3prog, 'BR"""': double3prog,
-            "ur'''": single3prog, 'ur"""': double3prog,
-            "uR'''": single3prog, 'uR"""': double3prog,
-            "Ur'''": single3prog, 'Ur"""': double3prog,
-            "UR'''": single3prog, 'UR"""': double3prog,
-            'r': None, 'R': None, 'b': None, 'B': None}
+# Because of leftmost-then-longest match semantics, be sure to put the
+# longest operators first (e.g., if = came before ==, == would get
+# recognized as two instances of =).
+Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
+                 r"//=?", r"->",
+                 r"[+\-*/%&@|^=<>]=?",
+                 r"~")

-triple_quoted = {}
-for t in ("'''", '"""',
-          "r'''", 'r"""', "R'''", 'R"""',
-          "b'''", 'b"""', "B'''", 'B"""',
-          "u'''", 'u"""', "U'''", 'U"""',
-          "br'''", 'br"""', "Br'''", 'Br"""',
-          "bR'''", 'bR"""', "BR'''", 'BR"""',
-          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
-          "uR'''", 'uR"""', "UR'''", 'UR"""'):
-    triple_quoted[t] = t
-single_quoted = {}
-for t in ("'", '"',
-          "r'", 'r"', "R'", 'R"',
-          "b'", 'b"', "B'", 'B"',
-          "u'", 'u"', "U'", 'U"',
-          "br'", 'br"', "Br'", 'Br"',
-          "bR'", 'bR"', "BR'", 'BR"',
-          "ur'", 'ur"', "Ur'", 'Ur"',
-          "uR'", 'uR"', "UR'", 'UR"'):
-    single_quoted[t] = t
+Bracket = '[][(){}]'
+Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
+Funny = group(Operator, Bracket, Special)

-del _compile
+PlainToken = group(Number, Funny, Name, capture=True)
+
+# First (or only) line of ' or " string.
+ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
+                group("'", r'\\\r?\n'),
+                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
+                group('"', r'\\\r?\n'))
+PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
+PseudoToken = group(Whitespace, capture=True) + \
+    group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
+
+# For a given string prefix plus quotes, endpats maps it to a regex
+#  to match the remainder of that string. _prefix can be empty, for
+#  a normal single or triple quoted string (with no prefix).
+endpats = {}
+for _prefix in _all_string_prefixes():
+    endpats[_prefix + "'"] = _compile(Single)
+    endpats[_prefix + '"'] = _compile(Double)
+    endpats[_prefix + "'''"] = _compile(Single3)
+    endpats[_prefix + '"""'] = _compile(Double3)
+
+# A set of all of the single and triple quoted string prefixes,
+#  including the opening quotes.
+single_quoted = set()
+triple_quoted = set()
+for t in _all_string_prefixes():
+    for u in (t + '"', t + "'"):
+        single_quoted.add(u)
+    for u in (t + '"""', t + "'''"):
+        triple_quoted.add(u)

-tabsize = 8

 # TODO add with?
 ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
                       'finally', 'while', 'return')
+pseudo_token_compiled = _compile(PseudoToken)


 class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
@@ -228,7 +235,7 @@ def generate_tokens(readline, use_exact_op_types=False):
                continue

        while pos < max:
-            pseudomatch = pseudoprog.match(line, pos)
+            pseudomatch = pseudo_token_compiled.match(line, pos)
            if not pseudomatch:                             # scan for tokens
                txt = line[pos]
                if line[pos] in '"\'':
@@ -272,7 +279,7 @@ def generate_tokens(readline, use_exact_op_types=False):
                assert not token.endswith("\n")
                additional_prefix = prefix + token
            elif token in triple_quoted:
-                endprog = endprogs[token]
+                endprog = endpats[token]
                endmatch = endprog.match(line, pos)
                if endmatch:                                # all on one line
                    pos = endmatch.end(0)
@@ -288,8 +295,8 @@ def generate_tokens(readline, use_exact_op_types=False):
                    token[:3] in single_quoted:
                if token[-1] == '\n':                       # continued string
                    contstr_start = lnum, start
-                    endprog = (endprogs.get(initial) or endprogs.get(token[1])
-                               or endprogs.get(token[2]))
+                    endprog = (endpats.get(initial) or endpats.get(token[1])
+                               or endpats.get(token[2]))
                    contstr = line[start:]
                    contline = line
                    break
--- a/test/test_parser/test_tokenize.py
+++ b/test/test_parser/test_tokenize.py
@@ -5,7 +5,7 @@ from textwrap import dedent

 import pytest

-from jedi._compatibility import u, is_py3
+from jedi._compatibility import u, is_py3, py_version
 from jedi.parser.token import NAME, OP, NEWLINE, STRING, INDENT
 from jedi.parser import ParserWithRecovery, load_grammar, tokenize

@@ -129,24 +129,32 @@ def test_ur_literals():
    Decided to parse `u''` literals regardless of Python version. This makes
    probably sense:

-    - Python 3.2 doesn't support it and is still supported by Jedi, but might
+    - Python 3+ doesn't support it, but it doesn't hurt
      not be. While this is incorrect, it's just incorrect for one "old" and in
      the future not very important version.
    - All the other Python versions work very well with it.
    """
-    def check(literal):
+    def check(literal, is_literal=True):
        io = StringIO(u(literal))
        tokens = tokenize.generate_tokens(io.readline)
        token_list = list(tokens)
        typ, result_literal, _, _ = token_list[0]
-        assert typ == STRING
-        assert result_literal == literal
+        if is_literal:
+            assert typ == STRING
+            assert result_literal == literal
+        else:
+            assert typ == NAME

    check('u""')
-    check('ur""')
-    check('Ur""')
-    check('UR""')
+    check('ur""', is_literal=not is_py3)
+    check('Ur""', is_literal=not is_py3)
+    check('UR""', is_literal=not is_py3)
    check('bR""')
-    # Must be in the right order.
-    with pytest.raises(AssertionError):
-        check('Rb""')
+    # Starting with Python 3.3 this ordering is also possible, but we just
+    # enable it for all versions. It doesn't hurt.
+    check('Rb""')
+    # Starting with Python 3.6 format strings where introduced.
+    check('fr""', is_literal=py_version >= 36)
+    check('rF""', is_literal=py_version >= 36)
+    check('f""', is_literal=py_version >= 36)
+    check('F""', is_literal=py_version >= 36)