diff --git a/parso/grammar.py b/parso/grammar.py
index 2bf8b10..87b57e6 100644
--- a/parso/grammar.py
+++ b/parso/grammar.py
@@ -96,7 +96,9 @@ class Grammar(object):
                 if old_lines == lines:
                     return module_node
 
-                new_node = self._diff_parser(self._pgen_grammar, module_node).update(
+                new_node = self._diff_parser(
+                    self._pgen_grammar, self._tokenizer, module_node
+                ).update(
                     old_lines=old_lines,
                     new_lines=lines
                 )
@@ -106,7 +108,11 @@ class Grammar(object):
 
         tokens = self._tokenizer(lines)
 
-        p = self._parser(self._pgen_grammar, error_recovery=error_recovery, start_symbol=start_symbol)
+        p = self._parser(
+            self._pgen_grammar,
+            error_recovery=error_recovery,
+            start_symbol=start_symbol
+        )
         root_node = p.parse(tokens=tokens)
 
         if cache or diff_cache:
@@ -120,6 +126,20 @@ class Grammar(object):
         return '<%s:%s>' % (self.__class__.__name__, txt)
 
 
+class PythonGrammar(Grammar):
+    def __init__(self, version_int, bnf_text):
+        super(PythonGrammar, self).__init__(
+            bnf_text,
+            tokenizer=self._tokenize_lines,
+            parser=PythonParser,
+            diff_parser=DiffParser
+        )
+        self._version_int = version_int
+
+    def _tokenize_lines(self, lines):
+        return tokenize_lines(lines, self._version_int)
+
+
 def load_grammar(version=None):
     """
     Loads a Python grammar. The default version is the current Python version.
@@ -147,12 +167,7 @@ def load_grammar(version=None):
         with open(path) as f:
             bnf_text = f.read()
 
-        grammar = Grammar(
-            bnf_text,
-            tokenizer=tokenize_lines,
-            parser=PythonParser,
-            diff_parser=DiffParser
-        )
+        grammar = PythonGrammar(version_int, bnf_text)
         return _loaded_grammars.setdefault(path, grammar)
     except FileNotFoundError:
         message = "Python version %s is currently not supported." % version
diff --git a/parso/pgen2/pgen.py b/parso/pgen2/pgen.py
index 60e4838..75bd2ea 100644
--- a/parso/pgen2/pgen.py
+++ b/parso/pgen2/pgen.py
@@ -13,7 +13,7 @@ from parso.python import tokenize
 class ParserGenerator(object):
     def __init__(self, bnf_text):
         self._bnf_text = bnf_text
-        self.generator = tokenize.tokenize(bnf_text)
+        self.generator = tokenize.tokenize(bnf_text, version_int=36)
         self._gettoken()  # Initialize lookahead
         self.dfas, self.startsymbol = self._parse()
         self.first = {}  # map from symbol name to set of tokens
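
The grammar.py hunks above thread the grammar's own tokenizer into both the regular parser and the diff parser, and the new PythonGrammar subclass binds a version_int to that tokenizer; pgen.py pins version_int=36 when tokenizing the bundled BNF text, presumably because any recent rule set can tokenize the grammar files. A rough usage sketch of the resulting API (a sketch only, assuming grammar files for both versions ship with the package):

    import parso

    # load_grammar() now hands back a PythonGrammar whose tokenizer is bound
    # to the requested version, so 2.7 and 3.6 sources get different rules.
    grammar36 = parso.load_grammar(version='3.6')
    grammar27 = parso.load_grammar(version='2.7')

    # Underscore grouping in number literals is only part of the 3.6 token set.
    module = grammar36.parse('million = 1_000_000\n')
    print(module.get_code())
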
""" - def __init__(self, pgen_grammar, module): + def __init__(self, pgen_grammar, tokenizer, module): self._pgen_grammar = pgen_grammar + self._tokenizer = tokenizer self._module = module def _reset(self): @@ -286,7 +287,7 @@ class DiffParser(object): is_first_token = True omitted_first_indent = False indents = [] - tokens = tokenize_lines(lines) + tokens = self._tokenizer(lines) stack = self._active_parser.pgen_parser.stack for typ, string, start_pos, prefix in tokens: start_pos = start_pos[0] + line_offset, start_pos[1] diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py index 19a3a8f..43f4547 100644 --- a/parso/python/tokenize.py +++ b/parso/python/tokenize.py @@ -11,6 +11,7 @@ memory optimizations here. """ from __future__ import absolute_import +import sys import string import re from collections import namedtuple @@ -19,12 +20,19 @@ from codecs import BOM_UTF8 from parso.python.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap, NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT) -from parso._compatibility import py_version, u +from parso._compatibility import py_version from parso.utils import splitlines +TokenCollection = namedtuple( + 'TokenCollection', + 'pseudo_token single_quoted triple_quoted endpats always_break_tokens', +) + BOM_UTF8_STRING = BOM_UTF8.decode('utf-8') +_token_collection_cache = {} + if py_version >= 30: # Python 3 has str.isidentifier() to check if a char is a valid identifier is_identifier = str.isidentifier @@ -46,55 +54,24 @@ def group(*choices, **kwargs): start += '?:' return start + '|'.join(choices) + ')' + def any(*choices): return group(*choices) + '*' + def maybe(*choices): return group(*choices) + '?' -# Note: we use unicode matching for names ("\w") but ascii matching for -# number literals. -Whitespace = r'[ \f\t]*' -Comment = r'#[^\r\n]*' -Name = r'\w+' - -if py_version >= 36: - Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+' - Binnumber = r'0[bB](?:_?[01])+' - Octnumber = r'0[oO](?:_?[0-7])+' - Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)' - Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) - Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*' - Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?', - r'\.[0-9](?:_?[0-9])*') + maybe(Exponent) - Expfloat = r'[0-9](?:_?[0-9])*' + Exponent - Floatnumber = group(Pointfloat, Expfloat) - Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]') -else: - Hexnumber = r'0[xX][0-9a-fA-F]+' - Binnumber = r'0[bB][01]+' - if py_version >= 30: - Octnumber = r'0[oO][0-7]+' - else: - Octnumber = '0[0-7]+' - Decnumber = r'(?:0+|[1-9][0-9]*)' - Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) - Exponent = r'[eE][-+]?[0-9]+' - Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent) - Expfloat = r'[0-9]+' + Exponent - Floatnumber = group(Pointfloat, Expfloat) - Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]') -Number = group(Imagnumber, Floatnumber, Intnumber) # Return the empty string, plus all of the valid string prefixes. -def _all_string_prefixes(): +def _all_string_prefixes(version_int): # The valid string prefixes. Only contain the lower case versions, # and don't contain any permuations (include 'fr', but not # 'rf'). The various permutations will be generated. _valid_string_prefixes = ['b', 'r', 'u', 'br'] - if py_version >= 36: + if version_int >= 36: _valid_string_prefixes += ['f', 'fr'] - if py_version <= 27: + if version_int <= 27: # TODO this is actually not 100% valid. ur is valid in Python 2.7, # while ru is not. 
@@ -46,55 +54,24 @@ def group(*choices, **kwargs):
         start += '?:'
     return start + '|'.join(choices) + ')'
 
+
 def any(*choices):
     return group(*choices) + '*'
 
+
 def maybe(*choices):
     return group(*choices) + '?'
 
-# Note: we use unicode matching for names ("\w") but ascii matching for
-# number literals.
-Whitespace = r'[ \f\t]*'
-Comment = r'#[^\r\n]*'
-Name = r'\w+'
-
-if py_version >= 36:
-    Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
-    Binnumber = r'0[bB](?:_?[01])+'
-    Octnumber = r'0[oO](?:_?[0-7])+'
-    Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
-    Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-    Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
-    Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
-                       r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
-    Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
-    Floatnumber = group(Pointfloat, Expfloat)
-    Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
-else:
-    Hexnumber = r'0[xX][0-9a-fA-F]+'
-    Binnumber = r'0[bB][01]+'
-    if py_version >= 30:
-        Octnumber = r'0[oO][0-7]+'
-    else:
-        Octnumber = '0[0-7]+'
-    Decnumber = r'(?:0+|[1-9][0-9]*)'
-    Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-    Exponent = r'[eE][-+]?[0-9]+'
-    Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
-    Expfloat = r'[0-9]+' + Exponent
-    Floatnumber = group(Pointfloat, Expfloat)
-    Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
-Number = group(Imagnumber, Floatnumber, Intnumber)
 
 # Return the empty string, plus all of the valid string prefixes.
-def _all_string_prefixes():
+def _all_string_prefixes(version_int):
     # The valid string prefixes. Only contain the lower case versions,
     #   and don't contain any permuations (include 'fr', but not
     #   'rf'). The various permutations will be generated.
     _valid_string_prefixes = ['b', 'r', 'u', 'br']
-    if py_version >= 36:
+    if version_int >= 36:
         _valid_string_prefixes += ['f', 'fr']
-    if py_version <= 27:
+    if version_int <= 27:
         # TODO this is actually not 100% valid. ur is valid in Python 2.7,
         # while ru is not.
         _valid_string_prefixes.append('ur')
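
With the version passed in explicitly, the set of string prefixes is decided by the target grammar rather than by the interpreter running parso: 'f' and 'fr' only appear for 3.6 and above, and 'ur' only for 2.7 and below. A hedged sketch of just that selection step (the real function goes on to generate every ordering and casing of these):

    def base_string_prefixes(version_int):
        # Mirrors the lists in _all_string_prefixes() above.
        prefixes = ['b', 'r', 'u', 'br']
        if version_int >= 36:
            prefixes += ['f', 'fr']   # f-strings exist from 3.6 on
        if version_int <= 27:
            prefixes += ['ur']        # only the 2.x grammar accepts ur
        return prefixes

    print(base_string_prefixes(36))   # ['b', 'r', 'u', 'br', 'f', 'fr']
    print(base_string_prefixes(27))   # ['b', 'r', 'u', 'br', 'ur']
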
@@ -109,70 +86,118 @@ def _all_string_prefixes():
                 result.add(''.join(s))
     return result
 
+
 def _compile(expr):
     return re.compile(expr, re.UNICODE)
 
-# Note that since _all_string_prefixes includes the empty string,
-# StringPrefix can be the empty string (making it optional).
-StringPrefix = group(*_all_string_prefixes())
-
-# Tail end of ' string.
-Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
-# Tail end of " string.
-Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
-# Tail end of ''' string.
-Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
-# Tail end of """ string.
-Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-Triple = group(StringPrefix + "'''", StringPrefix + '"""')
-
-# Because of leftmost-then-longest match semantics, be sure to put the
-# longest operators first (e.g., if = came before ==, == would get
-# recognized as two instances of =).
-Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
-                 r"//=?", r"->",
-                 r"[+\-*/%&@|^=<>]=?",
-                 r"~")
-
-Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
-Funny = group(Operator, Bracket, Special)
-
-PlainToken = group(Number, Funny, Name, capture=True)
-
-# First (or only) line of ' or " string.
-ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
-                group("'", r'\\\r?\n'),
-                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
-                group('"', r'\\\r?\n'))
-PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
-PseudoToken = group(Whitespace, capture=True) + \
-    group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
-
-# For a given string prefix plus quotes, endpats maps it to a regex
-# to match the remainder of that string. _prefix can be empty, for
-# a normal single or triple quoted string (with no prefix).
-endpats = {}
-for _prefix in _all_string_prefixes():
-    endpats[_prefix + "'"] = _compile(Single)
-    endpats[_prefix + '"'] = _compile(Double)
-    endpats[_prefix + "'''"] = _compile(Single3)
-    endpats[_prefix + '"""'] = _compile(Double3)
-
-# A set of all of the single and triple quoted string prefixes,
-# including the opening quotes.
-single_quoted = set()
-triple_quoted = set()
-for t in _all_string_prefixes():
-    for p in (t + '"', t + "'"):
-        single_quoted.add(p)
-    for p in (t + '"""', t + "'''"):
-        triple_quoted.add(p)
 
+def _get_token_collection(version_int):
+    try:
+        return _token_collection_cache[version_int]
+    except KeyError:
+        _token_collection_cache[version_int] = result = \
+            _create_token_collection(version_int)
+        return result
 
-ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
-                       'finally', 'while', 'with', 'return')
-pseudo_token_compiled = _compile(PseudoToken)
 
+def _create_token_collection(version_int):
+    # Note: we use unicode matching for names ("\w") but ascii matching for
+    # number literals.
+    Whitespace = r'[ \f\t]*'
+    Comment = r'#[^\r\n]*'
+    Name = r'\w+'
+
+    if version_int >= 36:
+        Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
+        Binnumber = r'0[bB](?:_?[01])+'
+        Octnumber = r'0[oO](?:_?[0-7])+'
+        Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
+        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
+        Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
+        Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
+                           r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
+        Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
+        Floatnumber = group(Pointfloat, Expfloat)
+        Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
+    else:
+        Hexnumber = r'0[xX][0-9a-fA-F]+'
+        Binnumber = r'0[bB][01]+'
+        if version_int >= 30:
+            Octnumber = r'0[oO][0-7]+'
+        else:
+            Octnumber = '0[0-7]+'
+        Decnumber = r'(?:0+|[1-9][0-9]*)'
+        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
+        Exponent = r'[eE][-+]?[0-9]+'
+        Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
+        Expfloat = r'[0-9]+' + Exponent
+        Floatnumber = group(Pointfloat, Expfloat)
+        Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
+    Number = group(Imagnumber, Floatnumber, Intnumber)
+
+    # Note that since _all_string_prefixes includes the empty string,
+    # StringPrefix can be the empty string (making it optional).
+    possible_prefixes = _all_string_prefixes(version_int)
+    StringPrefix = group(*possible_prefixes)
+
+    # Tail end of ' string.
+    Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
+    # Tail end of " string.
+    Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
+    # Tail end of ''' string.
+    Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
+    # Tail end of """ string.
+    Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
+    Triple = group(StringPrefix + "'''", StringPrefix + '"""')
+
+    # Because of leftmost-then-longest match semantics, be sure to put the
+    # longest operators first (e.g., if = came before ==, == would get
+    # recognized as two instances of =).
+    Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
+                     r"//=?", r"->",
+                     r"[+\-*/%&@|^=<>]=?",
+                     r"~")
+
+    Bracket = '[][(){}]'
+    Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
+    Funny = group(Operator, Bracket, Special)
+
+    # First (or only) line of ' or " string.
+    ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
+                    group("'", r'\\\r?\n'),
+                    StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
+                    group('"', r'\\\r?\n'))
+    PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
+    PseudoToken = group(Whitespace, capture=True) + \
+        group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
+
+    # For a given string prefix plus quotes, endpats maps it to a regex
+    # to match the remainder of that string. _prefix can be empty, for
+    # a normal single or triple quoted string (with no prefix).
+    endpats = {}
+    for _prefix in possible_prefixes:
+        endpats[_prefix + "'"] = _compile(Single)
+        endpats[_prefix + '"'] = _compile(Double)
+        endpats[_prefix + "'''"] = _compile(Single3)
+        endpats[_prefix + '"""'] = _compile(Double3)
+
+    # A set of all of the single and triple quoted string prefixes,
+    # including the opening quotes.
+    single_quoted = set()
+    triple_quoted = set()
+    for t in possible_prefixes:
+        for p in (t + '"', t + "'"):
+            single_quoted.add(p)
+        for p in (t + '"""', t + "'''"):
+            triple_quoted.add(p)
+
+    ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
+                           'finally', 'while', 'with', 'return')
+    pseudo_token_compiled = _compile(PseudoToken)
+    return TokenCollection(
+        pseudo_token_compiled, single_quoted, triple_quoted, endpats,
+        ALWAYS_BREAK_TOKENS
+    )
 
 
 class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
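
Everything the tokenizer loop needs (the compiled pseudo-token regex, the quote sets, the endpats table and the break keywords) is now built inside _create_token_collection() from the version-specific pieces above. The endpats table maps a string prefix plus its opening quote to a regex for the rest of the literal; a small hedged demo of how such a tail pattern behaves, with the pattern copied from the Single definition above:

    import re

    # Everything up to an unescaped closing single quote.
    single_tail = re.compile(r"[^'\\]*(?:\\.[^'\\]*)*'")

    source_line = "'a \\'quoted\\' word' + tail"
    m = single_tail.match(source_line, 1)   # start right after the opening quote
    print(source_line[:m.end()])            # prints: 'a \'quoted\' word'
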
@@ -203,13 +228,13 @@ class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
         return self.start_pos[0], self.start_pos[1] + len(self.string)
 
 
-def tokenize(code):
+def tokenize(code, version_int):
     """Generate tokens from a the source code (string)."""
     lines = splitlines(code, keepends=True)
-    return tokenize_lines(lines)
+    return tokenize_lines(lines, version_int)
 
 
-def tokenize_lines(lines):
+def tokenize_lines(lines, version_int):
     """
     A heavily modified Python standard library tokenizer.
 
@@ -217,6 +242,8 @@
     token. This idea comes from lib2to3. The prefix contains all information
     that is irrelevant for the parser like newlines in parentheses or comments.
     """
+    pseudo_token, single_quoted, triple_quoted, endpats, always_break_tokens, = \
+        _get_token_collection(version_int)
     paren_level = 0  # count parentheses
     indents = [0]
     max = 0
@@ -252,7 +279,7 @@ def tokenize_lines(lines):
                 continue
 
         while pos < max:
-            pseudomatch = pseudo_token_compiled.match(line, pos)
+            pseudomatch = pseudo_token.match(line, pos)
             if not pseudomatch:                             # scan for tokens
                 txt = line[pos:]
                 if txt.endswith('\n'):
@@ -329,7 +356,7 @@ def tokenize_lines(lines):
                 else:                                       # ordinary string
                     yield TokenInfo(STRING, token, spos, prefix)
             elif is_identifier(initial):                    # ordinary name
-                if token in ALWAYS_BREAK_TOKENS:
+                if token in always_break_tokens:
                     paren_level = 0
                     while True:
                         indent = indents.pop()
@@ -370,12 +397,16 @@ def tokenize_lines(lines):
 
 
 if __name__ == "__main__":
-    import sys
     if len(sys.argv) >= 2:
         path = sys.argv[1]
         with open(path) as f:
-            code = u(f.read())
+            code = f.read()
     else:
-        code = u(sys.stdin.read())
+        code = sys.stdin.read()
+
+    if isinstance(code, bytes):
+        from parso.utils import source_to_unicode
+        code = source_to_unicode(code)
+
     for token in tokenize(code):
         print(token)
diff --git a/parso/utils.py b/parso/utils.py
index 047643a..5d2abc0 100644
--- a/parso/utils.py
+++ b/parso/utils.py
@@ -108,7 +108,7 @@ def _parse_version(version):
     return int(major + minor)
 
 
-def version_string_to_int(version):
+def version_string_to_int(version=None):
     """
     Checks for a valid version number (e.g. `3.2` or `2.7.1` or `3`) and
     returns a corresponding int that is always two characters long in decimal.
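
At the call-site level the change is simply that tokenize() and tokenize_lines() now take the target version as a second argument, with version_string_to_int() (which now defaults to the running interpreter's version) producing that value. A hedged sketch of the new call contract:

    from parso.utils import version_string_to_int
    from parso.python.tokenize import tokenize

    code = "f'{x}'\n"
    for token in tokenize(code, version_string_to_int('3.6')):
        print(token)

    # Under version_int=27 the same source is tokenized without 'f' being
    # recognized as a string prefix, since 'f' is only added for 3.6+.
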
diff --git a/test/test_diff_parser.py b/test/test_diff_parser.py
index ceca69b..e0985fa 100644
--- a/test/test_diff_parser.py
+++ b/test/test_diff_parser.py
@@ -65,7 +65,11 @@ class Differ(object):
     def parse(self, code, copies=0, parsers=0, expect_error_leaves=False):
         logging.debug('differ: parse copies=%s parsers=%s', copies, parsers)
         lines = splitlines(code, keepends=True)
-        diff_parser = DiffParser(self.grammar._pgen_grammar, self.module)
+        diff_parser = DiffParser(
+            self.grammar._pgen_grammar,
+            self.grammar._tokenizer,
+            self.module,
+        )
         new_module = diff_parser.update(self.lines, lines)
         self.lines = lines
         assert code == new_module.get_code()
diff --git a/test/test_normalizer_issues_files.py b/test/test_normalizer_issues_files.py
index 8519187..b0672c7 100644
--- a/test/test_normalizer_issues_files.py
+++ b/test/test_normalizer_issues_files.py
@@ -5,7 +5,7 @@ tests of pydocstyle.
 import difflib
 import re
 
-from _compatibility import total_ordering
+from test._compatibility import total_ordering
 
 import parso
 from parso.utils import source_to_unicode
diff --git a/test/test_tokenize.py b/test/test_tokenize.py
index 9ae56e9..885dfff 100644
--- a/test/test_tokenize.py
+++ b/test/test_tokenize.py
@@ -5,7 +5,7 @@ from textwrap import dedent
 import pytest
 
 from parso._compatibility import py_version
-from parso.utils import splitlines
+from parso.utils import splitlines, version_string_to_int
 from parso.python.token import (
     NAME, NEWLINE, STRING, INDENT, DEDENT, ERRORTOKEN, ENDMARKER)
 from parso.python import tokenize
@@ -14,7 +14,9 @@ from parso.python.tokenize import TokenInfo
 
 
 def _get_token_list(string):
-    return list(tokenize.tokenize(string))
+    # Load the current version.
+    version_int = version_string_to_int()
+    return list(tokenize.tokenize(string, version_int))
 
 
 def test_end_pos_one_line():
@@ -41,8 +43,7 @@ def test_end_pos_multi_line():
 def test_simple_no_whitespace():
     # Test a simple one line string, no preceding whitespace
     simple_docstring = '"""simple one line docstring"""'
-    tokens = tokenize.tokenize(simple_docstring)
-    token_list = list(tokens)
+    token_list = _get_token_list(simple_docstring)
     _, value, _, prefix = token_list[0]
     assert prefix == ''
     assert value == '"""simple one line docstring"""'
@@ -51,8 +52,7 @@ def test_simple_no_whitespace():
 def test_simple_with_whitespace():
     # Test a simple one line string with preceding whitespace and newline
     simple_docstring = ' """simple one line docstring""" \r\n'
-    tokens = tokenize.tokenize(simple_docstring)
-    token_list = list(tokens)
+    token_list = _get_token_list(simple_docstring)
     assert token_list[0][0] == INDENT
     typ, value, start_pos, prefix = token_list[1]
     assert prefix == ' '
@@ -71,8 +71,7 @@ def test_function_whitespace():
         if x > 0:
             print(True)
     ''')
-    tokens = tokenize.tokenize(fundef)
-    token_list = list(tokens)
+    token_list = _get_token_list(fundef)
     for _, value, _, prefix in token_list:
         if value == 'test_whitespace':
             assert prefix == ' '
@@ -92,8 +91,7 @@ def test_tokenize_multiline_I():
     # Make sure multiline string having newlines have the end marker on the
     # next line
     fundef = '''""""\n'''
-    tokens = tokenize.tokenize(fundef)
-    token_list = list(tokens)
+    token_list = _get_token_list(fundef)
     assert token_list == [TokenInfo(ERRORTOKEN, '""""\n', (1, 0), ''),
                           TokenInfo(ENDMARKER , '', (2, 0), '')]
 
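
The tests now route tokenization through the _get_token_list() helper above, which resolves the running interpreter's version once per call. A possible extension, not part of this patch, would be to parametrize the same assertions over several target versions:

    import pytest
    from parso.python import tokenize
    from parso.utils import version_string_to_int

    @pytest.mark.parametrize('version', ['2.7', '3.3', '3.6'])
    def test_name_token_per_version(version):
        tokens = list(tokenize.tokenize('abc\n', version_string_to_int(version)))
        assert tokens[0].string == 'abc'
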
@@ -102,8 +100,7 @@ def test_tokenize_multiline_II():
     # Make sure multiline string having no newlines have the end marker on
     # same line
     fundef = '''""""'''
-    tokens = tokenize.tokenize(fundef)
-    token_list = list(tokens)
+    token_list = _get_token_list(fundef)
     assert token_list == [TokenInfo(ERRORTOKEN, '""""', (1, 0), ''),
                           TokenInfo(ENDMARKER, '', (1, 4), '')]
 
@@ -112,8 +109,7 @@ def test_tokenize_multiline_III():
     # Make sure multiline string having newlines have the end marker on the
     # next line even if several newline
     fundef = '''""""\n\n'''
-    tokens = tokenize.tokenize(fundef)
-    token_list = list(tokens)
+    token_list = _get_token_list(fundef)
     assert token_list == [TokenInfo(ERRORTOKEN, '""""\n\n', (1, 0), ''),
                           TokenInfo(ENDMARKER, '', (3, 0), '')]
 
@@ -123,8 +119,7 @@ def test_identifier_contains_unicode():
     def 我あφ():
         pass
     ''')
-    tokens = tokenize.tokenize(fundef)
-    token_list = list(tokens)
+    token_list = _get_token_list(fundef)
     unicode_token = token_list[1]
     if py_version >= 30:
         assert unicode_token[0] == NAME