Remove the old f-string grammar and fix the tests with the new syntax

This commit is contained in:
Dave Halter
2018-04-07 02:11:26 +02:00
parent 9941348ec6
commit 8f1a436ba1
5 changed files with 66 additions and 293 deletions

View File

@@ -12,7 +12,6 @@ from parso.parser import BaseParser
from parso.python.parser import Parser as PythonParser from parso.python.parser import Parser as PythonParser
from parso.python.errors import ErrorFinderConfig from parso.python.errors import ErrorFinderConfig
from parso.python import pep8 from parso.python import pep8
from parso.python import fstring
_loaded_grammars = {} _loaded_grammars = {}
@@ -186,7 +185,6 @@ class Grammar(object):
normalizer.walk(node) normalizer.walk(node)
return normalizer.issues return normalizer.issues
def __repr__(self): def __repr__(self):
labels = self._pgen_grammar.number2symbol.values() labels = self._pgen_grammar.number2symbol.values()
txt = ' '.join(list(labels)[:3]) + ' ...' txt = ' '.join(list(labels)[:3]) + ' ...'
@@ -215,34 +213,6 @@ class PythonGrammar(Grammar):
return tokenize(code, self.version_info) return tokenize(code, self.version_info)
class PythonFStringGrammar(Grammar):
_token_namespace = fstring.TokenNamespace
_start_symbol = 'fstring'
def __init__(self):
super(PythonFStringGrammar, self).__init__(
text=fstring.GRAMMAR,
tokenizer=fstring.tokenize,
parser=fstring.Parser
)
def parse(self, code, **kwargs):
return self._parse(code, **kwargs)
def _parse(self, code, error_recovery=True, start_pos=(1, 0)):
tokens = self._tokenizer(code, start_pos=start_pos)
p = self._parser(
self._pgen_grammar,
error_recovery=error_recovery,
start_symbol=self._start_symbol,
)
return p.parse(tokens=tokens)
def parse_leaf(self, leaf, error_recovery=True):
code = leaf._get_payload()
return self.parse(code, error_recovery=True, start_pos=leaf.start_pos)
def load_grammar(**kwargs): def load_grammar(**kwargs):
""" """
Loads a :py:class:`parso.Grammar`. The default version is the current Python Loads a :py:class:`parso.Grammar`. The default version is the current Python
@@ -273,10 +243,6 @@ def load_grammar(**kwargs):
except FileNotFoundError: except FileNotFoundError:
message = "Python version %s is currently not supported." % version message = "Python version %s is currently not supported." % version
raise NotImplementedError(message) raise NotImplementedError(message)
elif language == 'python-f-string':
if version is not None:
raise NotImplementedError("Currently different versions are not supported.")
return PythonFStringGrammar()
else: else:
raise NotImplementedError("No support for language %s." % language) raise NotImplementedError("No support for language %s." % language)

View File

@@ -124,7 +124,9 @@ class PgenParser(object):
self.error_recovery = error_recovery self.error_recovery = error_recovery
def parse(self, tokens): def parse(self, tokens):
for type_, value, start_pos, prefix in tokens: for tok in tokens:
print(tok)
type_, value, start_pos, prefix = tok
if self.add_token(type_, value, start_pos, prefix): if self.add_token(type_, value, start_pos, prefix):
break break
else: else:

View File

@@ -1,211 +0,0 @@
import re
from itertools import count
from parso.utils import PythonVersionInfo
from parso.utils import split_lines
from parso.python.tokenize import Token
from parso import parser
from parso.tree import TypedLeaf, ErrorNode, ErrorLeaf
# Python 3.6 — the first version with f-strings (PEP 498).
version36 = PythonVersionInfo(3, 6)
class TokenNamespace:
    """Flat namespace of the integer token ids used by the f-string tokenizer.

    ``token_map`` is the reverse lookup table: token id -> token name.
    """
    (LBRACE, RBRACE, ENDMARKER, COLON, CONVERSION, PYTHON_EXPR,
     EXCLAMATION_MARK, UNTERMINATED_STRING) = range(8)

    # ``locals()`` as the comprehension's outermost iterable is evaluated in
    # the class namespace being built, so exactly the public ids above are
    # included (dunders and underscore names are filtered out).
    token_map = {v: k for k, v in locals().items() if not k.startswith('_')}

    @classmethod
    def generate_token_id(cls, string):
        """Return the token id for a punctuation character or a token name.

        Punctuation (``{ } ! :``) maps to its dedicated id; any other string
        is looked up as an attribute name (e.g. ``'ENDMARKER'``).
        """
        punctuation = {
            '{': cls.LBRACE,
            '}': cls.RBRACE,
            '!': cls.EXCLAMATION_MARK,
            ':': cls.COLON,
        }
        try:
            return punctuation[string]
        except KeyError:
            return getattr(cls, string)
# pgen grammar for the contents of an f-string (the text between the quotes).
GRAMMAR = """
fstring: expression* ENDMARKER
format_spec: ':' expression*
expression: '{' PYTHON_EXPR [ '!' CONVERSION ] [ format_spec ] '}'
"""

# Literal f-string text: everything up to the next single brace (or the end).
_prefix = r'((?:[^{}]+)*)'
_expr = _prefix + r'(\{|\}|$)'
# Inside an embedded expression: consume characters that are not structurally
# significant, then capture the one that is ({ } [ ] : " ' !), if any.
_in_expr = r'([^{}\[\]:"\'!]*)(.?)'
# There's only one conversion character allowed. But the rules have to be
# checked later anyway, so allow more here. This makes error recovery nicer.
_conversion = r'([^={}:]*)(.?)'

_compiled_expr = re.compile(_expr)
_compiled_in_expr = re.compile(_in_expr)
_compiled_conversion = re.compile(_conversion)
def tokenize(code, start_pos=(1, 0)):
    """Tokenize the *inside* of an f-string literal.

    Yields ``Token`` tuples whose types come from ``TokenNamespace``.  The
    stream always ends with an ENDMARKER token.  ``start_pos`` is the
    (line, column) of the first character of ``code``; positions of yielded
    tokens are tracked relative to it.
    """
    def add_to_pos(string):
        # Advance the running (line, column) position by ``string``,
        # accounting for embedded newlines.
        lines = split_lines(string)
        l = len(lines[-1])
        if len(lines) > 1:
            start_pos[0] += len(lines) - 1
            start_pos[1] = l
        else:
            start_pos[1] += l

    def tok(value, type=None, prefix=''):
        # Build a Token at the current position.  Single-character tokens
        # ({ } ! :) derive their type from the value itself.
        if type is None:
            type = TokenNamespace.generate_token_id(value)
        add_to_pos(prefix)
        token = Token(type, value, tuple(start_pos), prefix)
        add_to_pos(value)
        return token

    start = 0
    recursion_level = 0  # how many unmatched '{' we are currently inside
    added_prefix = ''    # literal text ({{ / }} escapes) carried into the next prefix
    start_pos = list(start_pos)  # mutable copy, updated in place by add_to_pos()
    while True:
        match = _compiled_expr.match(code, start)
        prefix = added_prefix + match.group(1)
        found = match.group(2)
        start = match.end()
        if not found:
            # We're at the end.
            break

        if found == '}':
            if recursion_level == 0 and len(code) > start and code[start] == '}':
                # This is a }} escape.
                added_prefix = prefix + '}}'
                start += 1
                continue

            recursion_level = max(0, recursion_level - 1)
            yield tok(found, prefix=prefix)
            added_prefix = ''
        else:
            assert found == '{'
            if recursion_level == 0 and len(code) > start and code[start] == '{':
                # This is a {{ escape.
                added_prefix = prefix + '{{'
                start += 1
                continue

            recursion_level += 1
            yield tok(found, prefix=prefix)
            added_prefix = ''

            # Scan the embedded Python expression up to the first '!', ':'
            # or '}' that is not nested in brackets or a string literal.
            expression = ''
            squared_count = 0  # depth of [ ... ] nesting
            curly_count = 0    # depth of { ... } nesting inside the expression
            while True:
                expr_match = _compiled_in_expr.match(code, start)
                expression += expr_match.group(1)
                found = expr_match.group(2)
                start = expr_match.end()
                if found == '{':
                    curly_count += 1
                    expression += found
                elif found == '}' and curly_count > 0:
                    curly_count -= 1
                    expression += found
                elif found == '[':
                    squared_count += 1
                    expression += found
                elif found == ']':
                    # Use a max function here, because the Python code might
                    # just have syntax errors.
                    squared_count = max(0, squared_count - 1)
                    expression += found
                elif found == ':' and (squared_count or curly_count):
                    # A ':' inside brackets (slice, dict, lambda-free subset)
                    # belongs to the expression, not to a format spec.
                    expression += found
                elif found in ('"', "'"):
                    # Skip over a (possibly triple-quoted) string literal.
                    search = found
                    if len(code) > start + 1 and \
                            code[start] == found == code[start+1]:
                        search *= 3
                        start += 2
                    index = code.find(search, start)
                    if index == -1:
                        # Unterminated string: emit what we have and stop.
                        yield tok(expression, type=TokenNamespace.PYTHON_EXPR)
                        yield tok(
                            found + code[start:],
                            type=TokenNamespace.UNTERMINATED_STRING,
                        )
                        start = len(code)
                        break
                    expression += found + code[start:index+1]
                    start = index + 1
                elif found == '!' and len(code) > start and code[start] == '=':
                    # This is a python `!=` and not a conversion.
                    expression += found
                else:
                    # End of the expression: emit it, plus the terminating
                    # '!', ':' or '}' (if there was one).
                    yield tok(expression, type=TokenNamespace.PYTHON_EXPR)
                    if found:
                        yield tok(found)
                    break

            if found == '!':
                conversion_match = _compiled_conversion.match(code, start)
                found = conversion_match.group(2)
                start = conversion_match.end()
                yield tok(conversion_match.group(1), type=TokenNamespace.CONVERSION)
                if found:
                    yield tok(found)
            if found == '}':
                recursion_level -= 1
            # We don't need to handle everything after ':', because that is
            # basically new tokens.

    yield tok('', type=TokenNamespace.ENDMARKER, prefix=prefix)
class Parser(parser.BaseParser):
    """pgen parser for the f-string grammar, with custom error recovery."""

    def parse(self, tokens):
        node = super(Parser, self).parse(tokens)
        if isinstance(node, self.default_leaf):  # Is an endmarker.
            # If there's no curly braces we get back a non-module. We always
            # want an fstring.
            node = self.default_node('fstring', [node])
        return node

    def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos):
        # TODO this is so ugly.
        # Map the integer token id back to a lower-case leaf type name,
        # e.g. PYTHON_EXPR -> 'python_expr'.
        leaf_type = TokenNamespace.token_map[type].lower()
        return TypedLeaf(leaf_type, value, start_pos, prefix)

    def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
                       add_token_callback):
        # With error recovery disabled, defer to the base class (which raises).
        if not self._error_recovery:
            return super(Parser, self).error_recovery(
                pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
                add_token_callback
            )
        token_type = TokenNamespace.token_map[typ].lower()
        if len(stack) == 1:
            # Nothing above the root to unwind: wrap the offending token.
            error_leaf = ErrorLeaf(token_type, value, start_pos, prefix)
            stack[0][2][1].append(error_leaf)
        else:
            # Collapse everything above the root into an ErrorNode, then
            # retry the current token against the root state.
            # NOTE(review): assumes pgen stack entries are
            # (dfa, state, (type, nodes)) — confirm against parso's PgenParser.
            dfa, state, (type_, nodes) = stack[1]
            stack[0][2][1].append(ErrorNode(nodes))
            stack[1:] = []
            add_token_callback(typ, value, start_pos, prefix)

View File

@@ -106,8 +106,8 @@ def _get_token_collection(version_info):
return result return result
fstring_string_single_line = _compile(r'(?:[^{\r\n]+|\{\{)+') fstring_string_single_line = _compile(r'(?:[^{}\r\n]+|\{\{|\}\})+')
fstring_string_multi_line = _compile(r'(?:[^{]+|\{\{)+') fstring_string_multi_line = _compile(r'(?:[^{}]+|\{\{|\}\})+')
def _create_token_collection(version_info): def _create_token_collection(version_info):
@@ -253,7 +253,10 @@ class FStringNode(object):
self.quote = quote self.quote = quote
self.parentheses_count = 0 self.parentheses_count = 0
self.previous_lines = '' self.previous_lines = ''
self.in_format_spec = False self.last_string_start_pos = None
# In the syntax there can be multiple format_spec's nested:
# {x:{y:3}}
self.format_spec_count = 0
def open_parentheses(self, character): def open_parentheses(self, character):
self.parentheses_count += 1 self.parentheses_count += 1
@@ -265,7 +268,7 @@ class FStringNode(object):
return len(self.quote) == 3 return len(self.quote) == 3
def is_in_expr(self): def is_in_expr(self):
return self.parentheses_count return (self.parentheses_count - self.format_spec_count) > 0
def _check_fstring_ending(fstring_stack, token, from_start=False): def _check_fstring_ending(fstring_stack, token, from_start=False):
@@ -290,7 +293,7 @@ def _check_fstring_ending(fstring_stack, token, from_start=False):
return fstring_index, fstring_end return fstring_index, fstring_end
def _find_fstring_string(fstring_stack, line, pos): def _find_fstring_string(fstring_stack, line, lnum, pos):
tos = fstring_stack[-1] tos = fstring_stack[-1]
if tos.is_in_expr(): if tos.is_in_expr():
return '', pos return '', pos
@@ -302,8 +305,12 @@ def _find_fstring_string(fstring_stack, line, pos):
else: else:
match = fstring_string_single_line.match(line, pos) match = fstring_string_single_line.match(line, pos)
if match is None: if match is None:
string = fstring_stack[-1].previous_lines string = tos.previous_lines
else: else:
print(match, lnum, pos, repr(tos.previous_lines))
if not tos.previous_lines:
tos.last_string_start_pos = (lnum, pos)
string = match.group(0) string = match.group(0)
for fstring_stack_node in fstring_stack: for fstring_stack_node in fstring_stack:
try: try:
@@ -313,12 +320,11 @@ def _find_fstring_string(fstring_stack, line, pos):
new_pos += len(string) new_pos += len(string)
if allow_multiline and string.endswith('\n'): if allow_multiline and string.endswith('\n'):
fstring_stack[-1].previous_lines += string tos.previous_lines += string
string = '' string = ''
else: else:
string = fstring_stack[-1].previous_lines + string string = tos.previous_lines + string
fstring_stack[-1].previous_lines = ''
return string, new_pos return string, new_pos
@@ -385,25 +391,31 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
while pos < max: while pos < max:
if fstring_stack: if fstring_stack:
string, pos = _find_fstring_string(fstring_stack, line, pos) string, pos = _find_fstring_string(fstring_stack, line, lnum, pos)
if string: if string:
yield PythonToken(FSTRING_STRING, string, (lnum, pos), '') yield PythonToken(
FSTRING_STRING, string,
fstring_stack[-1].last_string_start_pos, ''
)
fstring_stack[-1].previous_lines = ''
continue continue
if pos < max: if pos == max:
rest = line[pos:] break
fstring_index, end = _check_fstring_ending(fstring_stack, rest, from_start=True)
if fstring_index is not None: rest = line[pos:]
yield PythonToken( fstring_index, end = _check_fstring_ending(fstring_stack, rest, from_start=True)
FSTRING_END,
fstring_stack[fstring_index].quote, if fstring_index is not None:
(lnum, pos), yield PythonToken(
prefix='' FSTRING_END,
) fstring_stack[fstring_index].quote,
del fstring_stack[fstring_index:] (lnum, pos),
pos += end prefix=''
continue )
del fstring_stack[fstring_index:]
pos += end
continue
pseudomatch = pseudo_token.match(line, pos) pseudomatch = pseudo_token.match(line, pos)
if not pseudomatch: # scan for tokens if not pseudomatch: # scan for tokens
@@ -531,7 +543,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
paren_level -= 1 paren_level -= 1
elif token == ':' and fstring_stack \ elif token == ':' and fstring_stack \
and fstring_stack[-1].parentheses_count == 1: and fstring_stack[-1].parentheses_count == 1:
fstring_stack[-1].in_format_spec = True fstring_stack[-1].format_spec_count += 1
try: try:
# This check is needed in any case to check if it's a valid # This check is needed in any case to check if it's a valid

View File

@@ -1,17 +1,18 @@
import pytest import pytest
from parso import load_grammar, ParserSyntaxError from parso import load_grammar, ParserSyntaxError
from parso.python.fstring import tokenize from parso.python.tokenize import tokenize
@pytest.fixture @pytest.fixture
def grammar(): def grammar():
return load_grammar(language="python-f-string") return load_grammar(version='3.6')
@pytest.mark.parametrize( @pytest.mark.parametrize(
'code', [ 'code', [
'{1}', '{1}',
'{1:}',
'', '',
'{1!a}', '{1!a}',
'{1!a:1}', '{1!a:1}',
@@ -26,22 +27,12 @@ def grammar():
'{{{1}', '{{{1}',
'1{{2{{3', '1{{2{{3',
'}}', '}}',
'{:}}}',
# Invalid, but will be checked, later.
'{}',
'{1:}',
'{:}',
'{:1}',
'{!:}',
'{!}',
'{!a}',
'{1:{}}',
'{1:{:}}',
] ]
) )
def test_valid(code, grammar): def test_valid(code, grammar):
fstring = grammar.parse(code, error_recovery=False) code = 'f"""%s"""' % code
module = grammar.parse(code, error_recovery=False)
fstring = module.children[0]
assert fstring.type == 'fstring' assert fstring.type == 'fstring'
assert fstring.get_code() == code assert fstring.get_code() == code
@@ -52,24 +43,37 @@ def test_valid(code, grammar):
'{', '{',
'{1!{a}}', '{1!{a}}',
'{!{a}}', '{!{a}}',
'{}',
'{:}',
'{:}}}',
'{:1}',
'{!:}',
'{!}',
'{!a}',
'{1:{}}',
'{1:{:}}',
] ]
) )
def test_invalid(code, grammar): def test_invalid(code, grammar):
code = 'f"""%s"""' % code
with pytest.raises(ParserSyntaxError): with pytest.raises(ParserSyntaxError):
grammar.parse(code, error_recovery=False) grammar.parse(code, error_recovery=False)
# It should work with error recovery. # It should work with error recovery.
#grammar.parse(code, error_recovery=True) grammar.parse(code, error_recovery=True)
@pytest.mark.parametrize( @pytest.mark.parametrize(
('code', 'start_pos', 'positions'), [ ('code', 'positions'), [
# 2 times 2, 5 because python expr and endmarker. # 2 times 2, 5 because python expr and endmarker.
('}{', (2, 3), [(2, 3), (2, 4), (2, 5), (2, 5)]), ('f"}{"', [(1, 0), (1, 2), (1, 3), (1, 4), (1, 5)]),
(' :{ 1 : } ', (1, 0), [(1, 2), (1, 3), (1, 6), (1, 8), (1, 10)]), ('f" :{ 1 : } "', [(1, 0), (1, 2), (1, 4), (1, 6), (1, 8), (1, 9),
('\n{\nfoo\n }', (2, 1), [(3, 0), (3, 1), (5, 1), (5, 2)]), (1, 10), (1, 11), (1, 12), (1, 13)]),
('f"""\n {\nfoo\n }"""', [(1, 0), (1, 4), (2, 1), (3, 0), (4, 1),
(4, 2), (4, 5)]),
] ]
) )
def test_tokenize_start_pos(code, start_pos, positions): def test_tokenize_start_pos(code, positions):
tokens = tokenize(code, start_pos) tokens = list(tokenize(code, version_info=(3, 6)))
print(tokens)
assert positions == [p.start_pos for p in tokens] assert positions == [p.start_pos for p in tokens]