Make the tokenizer version independent.

Dave Halter
2017-07-11 23:29:44 +02:00
parent b6022c7a80
commit e731eecdd8
8 changed files with 180 additions and 134 deletions
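Summary: tokenization used to consult the host interpreter's py_version at import time; after this commit the caller passes a two-digit version_int (27, 36, ...) explicitly, so a single parso installation can tokenize source written for other Python versions. A minimal sketch of the new entry point (the printed token streams are illustrative, not verbatim output):

    # Hedged sketch: the same source under two sets of tokenizer rules.
    from parso.python.tokenize import tokenize

    code = 'x = 1_000\n'
    # 3.6 rules allow underscores in numeric literals, so '1_000'
    # should come back as a single NUMBER token.
    print(list(tokenize(code, version_int=36)))
    # 2.7 rules do not, so '1_000' likely splits into a NUMBER ('1')
    # followed by a NAME ('_000').
    print(list(tokenize(code, version_int=27)))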

View File

@@ -96,7 +96,9 @@ class Grammar(object):
             if old_lines == lines:
                 return module_node

-            new_node = self._diff_parser(self._pgen_grammar, module_node).update(
+            new_node = self._diff_parser(
+                self._pgen_grammar, self._tokenizer, module_node
+            ).update(
                 old_lines=old_lines,
                 new_lines=lines
             )
@@ -106,7 +108,11 @@ class Grammar(object):
         tokens = self._tokenizer(lines)
-        p = self._parser(self._pgen_grammar, error_recovery=error_recovery, start_symbol=start_symbol)
+        p = self._parser(
+            self._pgen_grammar,
+            error_recovery=error_recovery,
+            start_symbol=start_symbol
+        )
         root_node = p.parse(tokens=tokens)

         if cache or diff_cache:
@@ -120,6 +126,20 @@ class Grammar(object):
         return '<%s:%s>' % (self.__class__.__name__, txt)


+class PythonGrammar(Grammar):
+    def __init__(self, version_int, bnf_text):
+        super(PythonGrammar, self).__init__(
+            bnf_text,
+            tokenizer=self._tokenize_lines,
+            parser=PythonParser,
+            diff_parser=DiffParser
+        )
+        self._version_int = version_int
+
+    def _tokenize_lines(self, lines):
+        return tokenize_lines(lines, self._version_int)
+
+
 def load_grammar(version=None):
     """
     Loads a Python grammar. The default version is the current Python version.
@@ -147,12 +167,7 @@ def load_grammar(version=None):
         with open(path) as f:
             bnf_text = f.read()

-        grammar = Grammar(
-            bnf_text,
-            tokenizer=tokenize_lines,
-            parser=PythonParser,
-            diff_parser=DiffParser
-        )
+        grammar = PythonGrammar(version_int, bnf_text)
         return _loaded_grammars.setdefault(path, grammar)
     except FileNotFoundError:
         message = "Python version %s is currently not supported." % version

View File

@@ -13,7 +13,7 @@ from parso.python import tokenize
 class ParserGenerator(object):
     def __init__(self, bnf_text):
         self._bnf_text = bnf_text
-        self.generator = tokenize.tokenize(bnf_text)
+        self.generator = tokenize.tokenize(bnf_text, version_int=36)
         self._gettoken()  # Initialize lookahead
         self.dfas, self.startsymbol = self._parse()
         self.first = {}  # map from symbol name to set of tokens
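Pinning version_int=36 in the parser generator is deliberate: the BNF grammar text consists only of names, operators, strings and newlines, which tokenize the same under every supported version, so any fixed value would do. A hedged illustration:

    from parso.python import tokenize

    bnf_line = "simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE\n"
    # No version-specific syntax is involved, so the fixed version_int=36
    # should yield the same stream as any other supported value.
    for token in tokenize.tokenize(bnf_line, version_int=36):
        print(token)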

View File

@@ -13,7 +13,7 @@ import logging
 from parso.utils import splitlines
 from parso.python.parser import Parser
 from parso.python.tree import EndMarker
-from parso.python.tokenize import (tokenize_lines, NEWLINE, TokenInfo,
+from parso.python.tokenize import (NEWLINE, TokenInfo,
                                    ENDMARKER, INDENT, DEDENT, ERRORTOKEN)
@@ -89,8 +89,9 @@ class DiffParser(object):
     An advanced form of parsing a file faster. Unfortunately comes with huge
     side effects. It changes the given module.
     """
-    def __init__(self, pgen_grammar, module):
+    def __init__(self, pgen_grammar, tokenizer, module):
         self._pgen_grammar = pgen_grammar
+        self._tokenizer = tokenizer
         self._module = module

     def _reset(self):
@@ -286,7 +287,7 @@ class DiffParser(object):
         is_first_token = True
         omitted_first_indent = False
         indents = []
-        tokens = tokenize_lines(lines)
+        tokens = self._tokenizer(lines)
         stack = self._active_parser.pgen_parser.stack

         for typ, string, start_pos, prefix in tokens:
             start_pos = start_pos[0] + line_offset, start_pos[1]
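This is plain dependency injection: DiffParser no longer imports tokenize_lines at module level but receives whatever tokenizer its grammar was constructed with. A hedged wiring sketch mirroring the constructor call in the Grammar class above (grammar, module_node, old_lines and new_lines are assumed to already exist):

    from parso.python.diff import DiffParser

    # grammar._tokenizer is the bound PythonGrammar._tokenize_lines,
    # which already carries the grammar's version_int.
    diff_parser = DiffParser(grammar._pgen_grammar, grammar._tokenizer, module_node)
    new_module = diff_parser.update(old_lines=old_lines, new_lines=new_lines)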

View File

@@ -11,6 +11,7 @@ memory optimizations here.
"""
from __future__ import absolute_import
import sys
import string
import re
from collections import namedtuple
@@ -19,12 +20,19 @@ from codecs import BOM_UTF8
 from parso.python.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap,
                                 NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
-from parso._compatibility import py_version, u
+from parso._compatibility import py_version
 from parso.utils import splitlines


+TokenCollection = namedtuple(
+    'TokenCollection',
+    'pseudo_token single_quoted triple_quoted endpats always_break_tokens',
+)
+
 BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')

+_token_collection_cache = {}
+
 if py_version >= 30:
     # Python 3 has str.isidentifier() to check if a char is a valid identifier
     is_identifier = str.isidentifier
@@ -46,55 +54,24 @@ def group(*choices, **kwargs):
         start += '?:'
     return start + '|'.join(choices) + ')'


 def any(*choices):
     return group(*choices) + '*'


 def maybe(*choices):
     return group(*choices) + '?'


-# Note: we use unicode matching for names ("\w") but ascii matching for
-# number literals.
-Whitespace = r'[ \f\t]*'
-Comment = r'#[^\r\n]*'
-Name = r'\w+'
-
-if py_version >= 36:
-    Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
-    Binnumber = r'0[bB](?:_?[01])+'
-    Octnumber = r'0[oO](?:_?[0-7])+'
-    Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
-    Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-    Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
-    Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
-                       r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
-    Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
-    Floatnumber = group(Pointfloat, Expfloat)
-    Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
-else:
-    Hexnumber = r'0[xX][0-9a-fA-F]+'
-    Binnumber = r'0[bB][01]+'
-    if py_version >= 30:
-        Octnumber = r'0[oO][0-7]+'
-    else:
-        Octnumber = '0[0-7]+'
-    Decnumber = r'(?:0+|[1-9][0-9]*)'
-    Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-    Exponent = r'[eE][-+]?[0-9]+'
-    Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
-    Expfloat = r'[0-9]+' + Exponent
-    Floatnumber = group(Pointfloat, Expfloat)
-    Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
-Number = group(Imagnumber, Floatnumber, Intnumber)
-
 # Return the empty string, plus all of the valid string prefixes.
-def _all_string_prefixes():
+def _all_string_prefixes(version_int):
     # The valid string prefixes. Only contain the lower case versions,
     # and don't contain any permutations (include 'fr', but not
     # 'rf'). The various permutations will be generated.
     _valid_string_prefixes = ['b', 'r', 'u', 'br']
-    if py_version >= 36:
+    if version_int >= 36:
         _valid_string_prefixes += ['f', 'fr']
-    if py_version <= 27:
+    if version_int <= 27:
         # TODO this is actually not 100% valid. ur is valid in Python 2.7,
         # while ru is not.
         _valid_string_prefixes.append('ur')
@@ -109,70 +86,118 @@ def _all_string_prefixes():
             result.add(''.join(s))
     return result


 def _compile(expr):
     return re.compile(expr, re.UNICODE)


-# Note that since _all_string_prefixes includes the empty string,
-# StringPrefix can be the empty string (making it optional).
-StringPrefix = group(*_all_string_prefixes())
-
-# Tail end of ' string.
-Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
-# Tail end of " string.
-Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
-# Tail end of ''' string.
-Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
-# Tail end of """ string.
-Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-Triple = group(StringPrefix + "'''", StringPrefix + '"""')
-
-# Because of leftmost-then-longest match semantics, be sure to put the
-# longest operators first (e.g., if = came before ==, == would get
-# recognized as two instances of =).
-Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
-                 r"//=?", r"->",
-                 r"[+\-*/%&@|^=<>]=?",
-                 r"~")
-
-Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
-Funny = group(Operator, Bracket, Special)
-
-PlainToken = group(Number, Funny, Name, capture=True)
-
-# First (or only) line of ' or " string.
-ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
-                group("'", r'\\\r?\n'),
-                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
-                group('"', r'\\\r?\n'))
-PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
-PseudoToken = group(Whitespace, capture=True) + \
-    group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
-
-# For a given string prefix plus quotes, endpats maps it to a regex
-# to match the remainder of that string. _prefix can be empty, for
-# a normal single or triple quoted string (with no prefix).
-endpats = {}
-for _prefix in _all_string_prefixes():
-    endpats[_prefix + "'"] = _compile(Single)
-    endpats[_prefix + '"'] = _compile(Double)
-    endpats[_prefix + "'''"] = _compile(Single3)
-    endpats[_prefix + '"""'] = _compile(Double3)
-
-# A set of all of the single and triple quoted string prefixes,
-# including the opening quotes.
-single_quoted = set()
-triple_quoted = set()
-for t in _all_string_prefixes():
-    for p in (t + '"', t + "'"):
-        single_quoted.add(p)
-    for p in (t + '"""', t + "'''"):
-        triple_quoted.add(p)
+def _get_token_collection(version_int):
+    try:
+        return _token_collection_cache[version_int]
+    except KeyError:
+        _token_collection_cache[version_int] = result = \
+            _create_token_collection(version_int)
+        return result

-ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
-                       'finally', 'while', 'with', 'return')
-
-pseudo_token_compiled = _compile(PseudoToken)
+
+def _create_token_collection(version_int):
+    # Note: we use unicode matching for names ("\w") but ascii matching for
+    # number literals.
+    Whitespace = r'[ \f\t]*'
+    Comment = r'#[^\r\n]*'
+    Name = r'\w+'
+
+    if version_int >= 36:
+        Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
+        Binnumber = r'0[bB](?:_?[01])+'
+        Octnumber = r'0[oO](?:_?[0-7])+'
+        Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
+        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
+        Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
+        Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
+                           r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
+        Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
+        Floatnumber = group(Pointfloat, Expfloat)
+        Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
+    else:
+        Hexnumber = r'0[xX][0-9a-fA-F]+'
+        Binnumber = r'0[bB][01]+'
+        if version_int >= 30:
+            Octnumber = r'0[oO][0-7]+'
+        else:
+            Octnumber = '0[0-7]+'
+        Decnumber = r'(?:0+|[1-9][0-9]*)'
+        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
+        Exponent = r'[eE][-+]?[0-9]+'
+        Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
+        Expfloat = r'[0-9]+' + Exponent
+        Floatnumber = group(Pointfloat, Expfloat)
+        Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
+    Number = group(Imagnumber, Floatnumber, Intnumber)
+
+    # Note that since _all_string_prefixes includes the empty string,
+    # StringPrefix can be the empty string (making it optional).
+    possible_prefixes = _all_string_prefixes(version_int)
+    StringPrefix = group(*possible_prefixes)
+
+    # Tail end of ' string.
+    Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
+    # Tail end of " string.
+    Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
+    # Tail end of ''' string.
+    Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
+    # Tail end of """ string.
+    Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
+    Triple = group(StringPrefix + "'''", StringPrefix + '"""')
+
+    # Because of leftmost-then-longest match semantics, be sure to put the
+    # longest operators first (e.g., if = came before ==, == would get
+    # recognized as two instances of =).
+    Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
+                     r"//=?", r"->",
+                     r"[+\-*/%&@|^=<>]=?",
+                     r"~")
+
+    Bracket = '[][(){}]'
+    Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
+    Funny = group(Operator, Bracket, Special)
+
+    # First (or only) line of ' or " string.
+    ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
+                    group("'", r'\\\r?\n'),
+                    StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
+                    group('"', r'\\\r?\n'))
+    PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
+    PseudoToken = group(Whitespace, capture=True) + \
+        group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
+
+    # For a given string prefix plus quotes, endpats maps it to a regex
+    # to match the remainder of that string. _prefix can be empty, for
+    # a normal single or triple quoted string (with no prefix).
+    endpats = {}
+    for _prefix in possible_prefixes:
+        endpats[_prefix + "'"] = _compile(Single)
+        endpats[_prefix + '"'] = _compile(Double)
+        endpats[_prefix + "'''"] = _compile(Single3)
+        endpats[_prefix + '"""'] = _compile(Double3)
+
+    # A set of all of the single and triple quoted string prefixes,
+    # including the opening quotes.
+    single_quoted = set()
+    triple_quoted = set()
+    for t in possible_prefixes:
+        for p in (t + '"', t + "'"):
+            single_quoted.add(p)
+        for p in (t + '"""', t + "'''"):
+            triple_quoted.add(p)
+
+    ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
+                           'finally', 'while', 'with', 'return')
+
+    pseudo_token_compiled = _compile(PseudoToken)
+
+    return TokenCollection(
+        pseudo_token_compiled, single_quoted, triple_quoted, endpats,
+        ALWAYS_BREAK_TOKENS
+    )


 class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
@@ -203,13 +228,13 @@ class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
         return self.start_pos[0], self.start_pos[1] + len(self.string)


-def tokenize(code):
+def tokenize(code, version_int):
     """Generate tokens from the source code (string)."""
     lines = splitlines(code, keepends=True)
-    return tokenize_lines(lines)
+    return tokenize_lines(lines, version_int)


-def tokenize_lines(lines):
+def tokenize_lines(lines, version_int):
     """
     A heavily modified Python standard library tokenizer.
@@ -217,6 +242,8 @@ def tokenize_lines(lines):
     token. This idea comes from lib2to3. The prefix contains all information
     that is irrelevant for the parser like newlines in parentheses or comments.
     """
+    pseudo_token, single_quoted, triple_quoted, endpats, always_break_tokens, = \
+        _get_token_collection(version_int)
     paren_level = 0  # count parentheses
     indents = [0]
     max = 0
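Because _get_token_collection memoizes by version_int, the regular expressions are compiled once per version and shared across all later tokenizer runs. A hedged probe of these private helpers (names and fields as in this commit, subject to change):

    from parso.python.tokenize import _get_token_collection

    c36 = _get_token_collection(36)
    c27 = _get_token_collection(27)
    assert _get_token_collection(36) is c36   # second lookup hits the cache
    # f-string prefixes exist only in the 3.6 collection...
    assert "f'''" in c36.triple_quoted and "f'''" not in c27.triple_quoted
    # ...while the 'ur' prefix is accepted only for 2.7 and below.
    assert "ur'" in c27.single_quoted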
@@ -252,7 +279,7 @@ def tokenize_lines(lines):
                 continue

         while pos < max:
-            pseudomatch = pseudo_token_compiled.match(line, pos)
+            pseudomatch = pseudo_token.match(line, pos)
             if not pseudomatch:  # scan for tokens
                 txt = line[pos:]
                 if txt.endswith('\n'):
@@ -329,7 +356,7 @@ def tokenize_lines(lines):
                 else:  # ordinary string
                     yield TokenInfo(STRING, token, spos, prefix)
             elif is_identifier(initial):  # ordinary name
-                if token in ALWAYS_BREAK_TOKENS:
+                if token in always_break_tokens:
                     paren_level = 0
                     while True:
                         indent = indents.pop()
@@ -370,12 +397,16 @@ def tokenize_lines(lines):
 if __name__ == "__main__":
     import sys
     if len(sys.argv) >= 2:
         path = sys.argv[1]
         with open(path) as f:
-            code = u(f.read())
+            code = f.read()
     else:
-        code = u(sys.stdin.read())
+        code = sys.stdin.read()
+
+    if isinstance(code, bytes):
+        from parso.utils import source_to_unicode
+        code = source_to_unicode(code)

     for token in tokenize(code):
         print(token)
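tokenize() stays a thin wrapper: it splits the code into lines, keeping the line endings, and delegates to tokenize_lines(), so both produce the same stream. A hedged sketch:

    from parso.python.tokenize import tokenize, tokenize_lines
    from parso.utils import splitlines

    code = 'if x:\n    pass\n'
    direct = list(tokenize(code, version_int=36))
    via_lines = list(tokenize_lines(splitlines(code, keepends=True), version_int=36))
    assert direct == via_lines   # TokenInfo namedtuples compare by value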

View File

@@ -108,7 +108,7 @@ def _parse_version(version):
     return int(major + minor)


-def version_string_to_int(version):
+def version_string_to_int(version=None):
     """
     Checks for a valid version number (e.g. `3.2` or `2.7.1` or `3`) and
     returns a corresponding int that is always two characters long in decimal.
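The new default makes "no version given" mean "the running interpreter". A hedged sketch of the two-digit convention described in the docstring:

    from parso.utils import version_string_to_int

    assert version_string_to_int('3.6') == 36
    assert version_string_to_int('2.7.1') == 27   # the micro version is dropped
    # With no argument the version is derived from the current interpreter,
    # e.g. 36 when running under CPython 3.6.
    current = version_string_to_int()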

View File

@@ -65,7 +65,11 @@ class Differ(object):
     def parse(self, code, copies=0, parsers=0, expect_error_leaves=False):
         logging.debug('differ: parse copies=%s parsers=%s', copies, parsers)
         lines = splitlines(code, keepends=True)
-        diff_parser = DiffParser(self.grammar._pgen_grammar, self.module)
+        diff_parser = DiffParser(
+            self.grammar._pgen_grammar,
+            self.grammar._tokenizer,
+            self.module,
+        )
         new_module = diff_parser.update(self.lines, lines)
         self.lines = lines
         assert code == new_module.get_code()

View File

@@ -5,7 +5,7 @@ tests of pydocstyle.
 import difflib
 import re

-from _compatibility import total_ordering
+from test._compatibility import total_ordering

 import parso
 from parso.utils import source_to_unicode

View File

@@ -5,7 +5,7 @@ from textwrap import dedent
 import pytest

 from parso._compatibility import py_version
-from parso.utils import splitlines
+from parso.utils import splitlines, version_string_to_int
 from parso.python.token import (
     NAME, NEWLINE, STRING, INDENT, DEDENT, ERRORTOKEN, ENDMARKER)
 from parso.python import tokenize
@@ -14,7 +14,9 @@ from parso.python.tokenize import TokenInfo
 def _get_token_list(string):
-    return list(tokenize.tokenize(string))
+    # Load the current version.
+    version_int = version_string_to_int()
+    return list(tokenize.tokenize(string, version_int))


 def test_end_pos_one_line():
@@ -41,8 +43,7 @@ def test_end_pos_multi_line():
 def test_simple_no_whitespace():
     # Test a simple one line string, no preceding whitespace
     simple_docstring = '"""simple one line docstring"""'
-    tokens = tokenize.tokenize(simple_docstring)
-    token_list = list(tokens)
+    token_list = _get_token_list(simple_docstring)
     _, value, _, prefix = token_list[0]
     assert prefix == ''
     assert value == '"""simple one line docstring"""'
@@ -51,8 +52,7 @@ def test_simple_no_whitespace():
 def test_simple_with_whitespace():
     # Test a simple one line string with preceding whitespace and newline
     simple_docstring = ' """simple one line docstring""" \r\n'
-    tokens = tokenize.tokenize(simple_docstring)
-    token_list = list(tokens)
+    token_list = _get_token_list(simple_docstring)
     assert token_list[0][0] == INDENT
     typ, value, start_pos, prefix = token_list[1]
     assert prefix == ' '
@@ -71,8 +71,7 @@ def test_function_whitespace():
         if x > 0:
             print(True)
     ''')
-    tokens = tokenize.tokenize(fundef)
-    token_list = list(tokens)
+    token_list = _get_token_list(fundef)
     for _, value, _, prefix in token_list:
         if value == 'test_whitespace':
             assert prefix == ' '
@@ -92,8 +91,7 @@ def test_tokenize_multiline_I():
     # Make sure multiline string having newlines have the end marker on the
     # next line
     fundef = '''""""\n'''
-    tokens = tokenize.tokenize(fundef)
-    token_list = list(tokens)
+    token_list = _get_token_list(fundef)
     assert token_list == [TokenInfo(ERRORTOKEN, '""""\n', (1, 0), ''),
                           TokenInfo(ENDMARKER, '', (2, 0), '')]
@@ -102,8 +100,7 @@ def test_tokenize_multiline_II():
     # Make sure multiline string having no newlines have the end marker on
     # same line
     fundef = '''""""'''
-    tokens = tokenize.tokenize(fundef)
-    token_list = list(tokens)
+    token_list = _get_token_list(fundef)
     assert token_list == [TokenInfo(ERRORTOKEN, '""""', (1, 0), ''),
                           TokenInfo(ENDMARKER, '', (1, 4), '')]
@@ -112,8 +109,7 @@ def test_tokenize_multiline_III():
     # Make sure multiline string having newlines have the end marker on the
     # next line even if several newline
     fundef = '''""""\n\n'''
-    tokens = tokenize.tokenize(fundef)
-    token_list = list(tokens)
+    token_list = _get_token_list(fundef)
     assert token_list == [TokenInfo(ERRORTOKEN, '""""\n\n', (1, 0), ''),
                           TokenInfo(ENDMARKER, '', (3, 0), '')]
@@ -123,8 +119,7 @@ def test_identifier_contains_unicode():
     def 我あφ():
         pass
     ''')
-    tokens = tokenize.tokenize(fundef)
-    token_list = list(tokens)
+    token_list = _get_token_list(fundef)
     unicode_token = token_list[1]
     if py_version >= 30:
         assert unicode_token[0] == NAME