Make the tokenizer version independent.

Author: Dave Halter
Date: 2017-07-11 23:29:44 +02:00
parent b6022c7a80
commit e731eecdd8
8 changed files with 180 additions and 134 deletions

View File

@@ -96,7 +96,9 @@ class Grammar(object):
                 if old_lines == lines:
                     return module_node

-                new_node = self._diff_parser(self._pgen_grammar, module_node).update(
+                new_node = self._diff_parser(
+                    self._pgen_grammar, self._tokenizer, module_node
+                ).update(
                     old_lines=old_lines,
                     new_lines=lines
                 )
@@ -106,7 +108,11 @@ class Grammar(object):
         tokens = self._tokenizer(lines)
-        p = self._parser(self._pgen_grammar, error_recovery=error_recovery, start_symbol=start_symbol)
+        p = self._parser(
+            self._pgen_grammar,
+            error_recovery=error_recovery,
+            start_symbol=start_symbol
+        )
         root_node = p.parse(tokens=tokens)

         if cache or diff_cache:
@@ -120,6 +126,20 @@ class Grammar(object):
         return '<%s:%s>' % (self.__class__.__name__, txt)


+class PythonGrammar(Grammar):
+    def __init__(self, version_int, bnf_text):
+        super(PythonGrammar, self).__init__(
+            bnf_text,
+            tokenizer=self._tokenize_lines,
+            parser=PythonParser,
+            diff_parser=DiffParser
+        )
+        self._version_int = version_int
+
+    def _tokenize_lines(self, lines):
+        return tokenize_lines(lines, self._version_int)
+
+
 def load_grammar(version=None):
     """
     Loads a Python grammar. The default version is the current Python version.
@@ -147,12 +167,7 @@ def load_grammar(version=None):
         with open(path) as f:
             bnf_text = f.read()

-        grammar = Grammar(
-            bnf_text,
-            tokenizer=tokenize_lines,
-            parser=PythonParser,
-            diff_parser=DiffParser
-        )
+        grammar = PythonGrammar(version_int, bnf_text)
         return _loaded_grammars.setdefault(path, grammar)
     except FileNotFoundError:
         message = "Python version %s is currently not supported." % version
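
The net effect of these grammar.py hunks is that the requested version now travels from load_grammar() into the tokenizer through the bound PythonGrammar._tokenize_lines method. A minimal sketch of that flow, using only names that appear in this diff (note that _tokenizer and _version_int are private attributes, and the package-level import path is assumed):

import parso

grammar = parso.load_grammar(version='3.6')    # version_string_to_int('3.6') == 36 internally
# grammar._tokenizer is the bound PythonGrammar._tokenize_lines, so every call
# tokenizes with the 3.6 rules without passing a version around explicitly:
tokens = list(grammar._tokenizer(['f"x"\n']))  # 'f' is accepted as a string prefix for 3.6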

View File

@@ -13,7 +13,7 @@ from parso.python import tokenize
 class ParserGenerator(object):
     def __init__(self, bnf_text):
         self._bnf_text = bnf_text
-        self.generator = tokenize.tokenize(bnf_text)
+        self.generator = tokenize.tokenize(bnf_text, version_int=36)
         self._gettoken()  # Initialize lookahead
         self.dfas, self.startsymbol = self._parse()
         self.first = {}  # map from symbol name to set of tokens
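
Pinning version_int=36 here is presumably safe because the BNF grammar files contain only names, operators and newlines, which tokenize the same way in every supported version. A rough illustration (the rule text below is made up):

from parso.python import tokenize

bnf_line = "simple_stmt: small_stmt NEWLINE\n"
for typ, string, start_pos, prefix in tokenize.tokenize(bnf_line, version_int=36):
    print(typ, repr(string))    # NAME, OP, NEWLINE and ENDMARKER tokens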

View File

@@ -13,7 +13,7 @@ import logging
 from parso.utils import splitlines
 from parso.python.parser import Parser
 from parso.python.tree import EndMarker
-from parso.python.tokenize import (tokenize_lines, NEWLINE, TokenInfo,
-                                   ENDMARKER, INDENT, DEDENT, ERRORTOKEN)
+from parso.python.tokenize import (NEWLINE, TokenInfo,
+                                   ENDMARKER, INDENT, DEDENT, ERRORTOKEN)
@@ -89,8 +89,9 @@ class DiffParser(object):
     An advanced form of parsing a file faster. Unfortunately comes with huge
     side effects. It changes the given module.
     """
-    def __init__(self, pgen_grammar, module):
+    def __init__(self, pgen_grammar, tokenizer, module):
         self._pgen_grammar = pgen_grammar
+        self._tokenizer = tokenizer
         self._module = module

     def _reset(self):
@@ -286,7 +287,7 @@ class DiffParser(object):
         is_first_token = True
         omitted_first_indent = False
         indents = []
-        tokens = tokenize_lines(lines)
+        tokens = self._tokenizer(lines)
         stack = self._active_parser.pgen_parser.stack
         for typ, string, start_pos, prefix in tokens:
             start_pos = start_pos[0] + line_offset, start_pos[1]
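
With the tokenizer injected instead of imported, a caller that drives DiffParser directly has to supply it, as the Differ fixture later in this commit does. A hedged sketch of that wiring (it relies on the private _pgen_grammar and _tokenizer attributes shown elsewhere in this diff):

import parso
from parso.python.diff import DiffParser
from parso.utils import splitlines

old_code = "x = 1\n"
new_code = "x = 1\ny = 2\n"

grammar = parso.load_grammar(version='3.6')
module_node = grammar.parse(old_code)

diff_parser = DiffParser(grammar._pgen_grammar, grammar._tokenizer, module_node)
new_module = diff_parser.update(old_lines=splitlines(old_code, keepends=True),
                                new_lines=splitlines(new_code, keepends=True))
assert new_module.get_code() == new_code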

View File

@@ -11,6 +11,7 @@ memory optimizations here.
""" """
from __future__ import absolute_import from __future__ import absolute_import
import sys
import string import string
import re import re
from collections import namedtuple from collections import namedtuple
@@ -19,12 +20,19 @@ from codecs import BOM_UTF8
 from parso.python.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap,
                                 NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
-from parso._compatibility import py_version, u
+from parso._compatibility import py_version
 from parso.utils import splitlines

+
+TokenCollection = namedtuple(
+    'TokenCollection',
+    'pseudo_token single_quoted triple_quoted endpats always_break_tokens',
+)
+
 BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')

+_token_collection_cache = {}
+
 if py_version >= 30:
     # Python 3 has str.isidentifier() to check if a char is a valid identifier
     is_identifier = str.isidentifier
@@ -46,55 +54,24 @@ def group(*choices, **kwargs):
         start += '?:'
     return start + '|'.join(choices) + ')'


 def any(*choices):
     return group(*choices) + '*'


 def maybe(*choices):
     return group(*choices) + '?'

-# Note: we use unicode matching for names ("\w") but ascii matching for
-# number literals.
-Whitespace = r'[ \f\t]*'
-Comment = r'#[^\r\n]*'
-Name = r'\w+'
-
-if py_version >= 36:
-    Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
-    Binnumber = r'0[bB](?:_?[01])+'
-    Octnumber = r'0[oO](?:_?[0-7])+'
-    Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
-    Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-    Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
-    Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
-                       r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
-    Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
-    Floatnumber = group(Pointfloat, Expfloat)
-    Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
-else:
-    Hexnumber = r'0[xX][0-9a-fA-F]+'
-    Binnumber = r'0[bB][01]+'
-    if py_version >= 30:
-        Octnumber = r'0[oO][0-7]+'
-    else:
-        Octnumber = '0[0-7]+'
-    Decnumber = r'(?:0+|[1-9][0-9]*)'
-    Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-    Exponent = r'[eE][-+]?[0-9]+'
-    Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
-    Expfloat = r'[0-9]+' + Exponent
-    Floatnumber = group(Pointfloat, Expfloat)
-    Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
-Number = group(Imagnumber, Floatnumber, Intnumber)

 # Return the empty string, plus all of the valid string prefixes.
-def _all_string_prefixes():
+def _all_string_prefixes(version_int):
     # The valid string prefixes. Only contain the lower case versions,
     #  and don't contain any permuations (include 'fr', but not
     #  'rf'). The various permutations will be generated.
     _valid_string_prefixes = ['b', 'r', 'u', 'br']
-    if py_version >= 36:
+    if version_int >= 36:
         _valid_string_prefixes += ['f', 'fr']
-    if py_version <= 27:
+    if version_int <= 27:
         # TODO this is actually not 100% valid. ur is valid in Python 2.7,
         # while ru is not.
         _valid_string_prefixes.append('ur')
@@ -109,12 +86,59 @@ def _all_string_prefixes():
             result.add(''.join(s))
     return result


 def _compile(expr):
     return re.compile(expr, re.UNICODE)


+def _get_token_collection(version_int):
+    try:
+        return _token_collection_cache[version_int]
+    except KeyError:
+        _token_collection_cache[version_int] = result = \
+            _create_token_collection(version_int)
+        return result
+
+
+def _create_token_collection(version_int):
+    # Note: we use unicode matching for names ("\w") but ascii matching for
+    # number literals.
+    Whitespace = r'[ \f\t]*'
+    Comment = r'#[^\r\n]*'
+    Name = r'\w+'
+
+    if version_int >= 36:
+        Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
+        Binnumber = r'0[bB](?:_?[01])+'
+        Octnumber = r'0[oO](?:_?[0-7])+'
+        Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
+        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
+        Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
+        Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
+                           r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
+        Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
+        Floatnumber = group(Pointfloat, Expfloat)
+        Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
+    else:
+        Hexnumber = r'0[xX][0-9a-fA-F]+'
+        Binnumber = r'0[bB][01]+'
+        if version_int >= 30:
+            Octnumber = r'0[oO][0-7]+'
+        else:
+            Octnumber = '0[0-7]+'
+        Decnumber = r'(?:0+|[1-9][0-9]*)'
+        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
+        Exponent = r'[eE][-+]?[0-9]+'
+        Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
+        Expfloat = r'[0-9]+' + Exponent
+        Floatnumber = group(Pointfloat, Expfloat)
+        Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
+    Number = group(Imagnumber, Floatnumber, Intnumber)
+
     # Note that since _all_string_prefixes includes the empty string,
     # StringPrefix can be the empty string (making it optional).
-    StringPrefix = group(*_all_string_prefixes())
+    possible_prefixes = _all_string_prefixes(version_int)
+    StringPrefix = group(*possible_prefixes)

     # Tail end of ' string.
     Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
@@ -138,8 +162,6 @@ Bracket = '[][(){}]'
     Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
     Funny = group(Operator, Bracket, Special)

-    PlainToken = group(Number, Funny, Name, capture=True)
-
     # First (or only) line of ' or " string.
     ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                     group("'", r'\\\r?\n'),
@@ -153,7 +175,7 @@ PseudoToken = group(Whitespace, capture=True) + \
     # to match the remainder of that string. _prefix can be empty, for
     # a normal single or triple quoted string (with no prefix).
     endpats = {}
-    for _prefix in _all_string_prefixes():
+    for _prefix in possible_prefixes:
         endpats[_prefix + "'"] = _compile(Single)
         endpats[_prefix + '"'] = _compile(Double)
         endpats[_prefix + "'''"] = _compile(Single3)
@@ -163,16 +185,19 @@ for _prefix in _all_string_prefixes():
     # including the opening quotes.
     single_quoted = set()
     triple_quoted = set()
-    for t in _all_string_prefixes():
+    for t in possible_prefixes:
         for p in (t + '"', t + "'"):
             single_quoted.add(p)
         for p in (t + '"""', t + "'''"):
             triple_quoted.add(p)

     ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
                            'finally', 'while', 'with', 'return')
     pseudo_token_compiled = _compile(PseudoToken)
+    return TokenCollection(
+        pseudo_token_compiled, single_quoted, triple_quoted, endpats,
+        ALWAYS_BREAK_TOKENS
+    )


 class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
@@ -203,13 +228,13 @@ class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
         return self.start_pos[0], self.start_pos[1] + len(self.string)


-def tokenize(code):
+def tokenize(code, version_int):
     """Generate tokens from a the source code (string)."""
     lines = splitlines(code, keepends=True)
-    return tokenize_lines(lines)
+    return tokenize_lines(lines, version_int)


-def tokenize_lines(lines):
+def tokenize_lines(lines, version_int):
     """
     A heavily modified Python standard library tokenizer.
@@ -217,6 +242,8 @@ def tokenize_lines(lines):
     token. This idea comes from lib2to3. The prefix contains all information
     that is irrelevant for the parser like newlines in parentheses or comments.
     """
+    pseudo_token, single_quoted, triple_quoted, endpats, always_break_tokens, = \
+        _get_token_collection(version_int)
     paren_level = 0  # count parentheses
     indents = [0]
     max = 0
@@ -252,7 +279,7 @@ def tokenize_lines(lines):
             continue

         while pos < max:
-            pseudomatch = pseudo_token_compiled.match(line, pos)
+            pseudomatch = pseudo_token.match(line, pos)
             if not pseudomatch:                             # scan for tokens
                 txt = line[pos:]
                 if txt.endswith('\n'):
@@ -329,7 +356,7 @@ def tokenize_lines(lines):
                 else:                                       # ordinary string
                     yield TokenInfo(STRING, token, spos, prefix)
             elif is_identifier(initial):                    # ordinary name
-                if token in ALWAYS_BREAK_TOKENS:
+                if token in always_break_tokens:
                     paren_level = 0
                     while True:
                         indent = indents.pop()
@@ -370,12 +397,16 @@ def tokenize_lines(lines):
 if __name__ == "__main__":
-    import sys
     if len(sys.argv) >= 2:
         path = sys.argv[1]
         with open(path) as f:
-            code = u(f.read())
+            code = f.read()
     else:
-        code = u(sys.stdin.read())
+        code = sys.stdin.read()
+
+    if isinstance(code, bytes):
+        from parso.utils import source_to_unicode
+        code = source_to_unicode(code)
+
     for token in tokenize(code):
         print(token)
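
Taken together, the tokenize.py changes move every version-dependent piece (number regexes, string prefixes, the pseudo token, the always-break keywords) into a per-version TokenCollection that is built lazily and memoized. A small sketch of the observable behaviour, using only signatures from this diff; the 1_000 example leans on the 3.6 vs. pre-3.6 number regexes above:

from parso.python import tokenize
from parso.python.tokenize import tokenize_lines
from parso.utils import splitlines

# The collection for a version is compiled once, then served from the cache:
assert tokenize._get_token_collection(36) is tokenize._get_token_collection(36)

# 3.6 rules: '1_000' is a single NUMBER token; the 2.7 rules should split it
# into NUMBER '1' followed by NAME '_000'.
print([t.string for t in tokenize.tokenize('1_000\n', 36)])
print([t.string for t in tokenize.tokenize('1_000\n', 27)])

# tokenize() stays a thin wrapper around tokenize_lines():
code = 'x = 1\n'
lines = splitlines(code, keepends=True)
assert list(tokenize.tokenize(code, 36)) == list(tokenize_lines(lines, 36))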

View File

@@ -108,7 +108,7 @@ def _parse_version(version):
     return int(major + minor)


-def version_string_to_int(version):
+def version_string_to_int(version=None):
     """
     Checks for a valid version number (e.g. `3.2` or `2.7.1` or `3`) and
     returns a corresponding int that is always two characters long in decimal.
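
Giving version_string_to_int() a default lets callers (such as the tokenize tests below) ask for the running interpreter's version. Expected behaviour, per the docstring:

from parso.utils import version_string_to_int

assert version_string_to_int('3.6') == 36
assert version_string_to_int('2.7.1') == 27   # only major.minor end up in the int
version_string_to_int()                       # no argument: the running Python, e.g. 36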

View File

@@ -65,7 +65,11 @@ class Differ(object):
     def parse(self, code, copies=0, parsers=0, expect_error_leaves=False):
         logging.debug('differ: parse copies=%s parsers=%s', copies, parsers)
         lines = splitlines(code, keepends=True)
-        diff_parser = DiffParser(self.grammar._pgen_grammar, self.module)
+        diff_parser = DiffParser(
+            self.grammar._pgen_grammar,
+            self.grammar._tokenizer,
+            self.module,
+        )
         new_module = diff_parser.update(self.lines, lines)
         self.lines = lines
         assert code == new_module.get_code()

View File

@@ -5,7 +5,7 @@ tests of pydocstyle.
 import difflib
 import re

-from _compatibility import total_ordering
+from test._compatibility import total_ordering

 import parso
 from parso.utils import source_to_unicode

View File

@@ -5,7 +5,7 @@ from textwrap import dedent
 import pytest

 from parso._compatibility import py_version
-from parso.utils import splitlines
+from parso.utils import splitlines, version_string_to_int
 from parso.python.token import (
     NAME, NEWLINE, STRING, INDENT, DEDENT, ERRORTOKEN, ENDMARKER)
 from parso.python import tokenize
@@ -14,7 +14,9 @@ from parso.python.tokenize import TokenInfo
 def _get_token_list(string):
-    return list(tokenize.tokenize(string))
+    # Load the current version.
+    version_int = version_string_to_int()
+    return list(tokenize.tokenize(string, version_int))


 def test_end_pos_one_line():
@@ -41,8 +43,7 @@ def test_end_pos_multi_line():
 def test_simple_no_whitespace():
     # Test a simple one line string, no preceding whitespace
     simple_docstring = '"""simple one line docstring"""'
-    tokens = tokenize.tokenize(simple_docstring)
-    token_list = list(tokens)
+    token_list = _get_token_list(simple_docstring)
     _, value, _, prefix = token_list[0]
     assert prefix == ''
     assert value == '"""simple one line docstring"""'
@@ -51,8 +52,7 @@ def test_simple_no_whitespace():
 def test_simple_with_whitespace():
     # Test a simple one line string with preceding whitespace and newline
     simple_docstring = ' """simple one line docstring""" \r\n'
-    tokens = tokenize.tokenize(simple_docstring)
-    token_list = list(tokens)
+    token_list = _get_token_list(simple_docstring)
     assert token_list[0][0] == INDENT
     typ, value, start_pos, prefix = token_list[1]
     assert prefix == ' '
@@ -71,8 +71,7 @@ def test_function_whitespace():
         if x > 0:
             print(True)
     ''')
-    tokens = tokenize.tokenize(fundef)
-    token_list = list(tokens)
+    token_list = _get_token_list(fundef)
     for _, value, _, prefix in token_list:
         if value == 'test_whitespace':
             assert prefix == ' '
@@ -92,8 +91,7 @@ def test_tokenize_multiline_I():
     # Make sure multiline string having newlines have the end marker on the
     # next line
     fundef = '''""""\n'''
-    tokens = tokenize.tokenize(fundef)
-    token_list = list(tokens)
+    token_list = _get_token_list(fundef)
     assert token_list == [TokenInfo(ERRORTOKEN, '""""\n', (1, 0), ''),
                           TokenInfo(ENDMARKER , '', (2, 0), '')]
@@ -102,8 +100,7 @@ def test_tokenize_multiline_II():
     # Make sure multiline string having no newlines have the end marker on
     # same line
     fundef = '''""""'''
-    tokens = tokenize.tokenize(fundef)
-    token_list = list(tokens)
+    token_list = _get_token_list(fundef)
     assert token_list == [TokenInfo(ERRORTOKEN, '""""', (1, 0), ''),
                           TokenInfo(ENDMARKER, '', (1, 4), '')]
@@ -112,8 +109,7 @@ def test_tokenize_multiline_III():
     # Make sure multiline string having newlines have the end marker on the
     # next line even if several newline
     fundef = '''""""\n\n'''
-    tokens = tokenize.tokenize(fundef)
-    token_list = list(tokens)
+    token_list = _get_token_list(fundef)
     assert token_list == [TokenInfo(ERRORTOKEN, '""""\n\n', (1, 0), ''),
                           TokenInfo(ENDMARKER, '', (3, 0), '')]
@@ -123,8 +119,7 @@ def test_identifier_contains_unicode():
     def 我あφ():
         pass
     ''')
-    tokens = tokenize.tokenize(fundef)
-    token_list = list(tokens)
+    token_list = _get_token_list(fundef)
     unicode_token = token_list[1]
     if py_version >= 30:
         assert unicode_token[0] == NAME