diff --git a/parso/grammar.py b/parso/grammar.py index e5abf81..c825b55 100644 --- a/parso/grammar.py +++ b/parso/grammar.py @@ -12,7 +12,6 @@ from parso.parser import BaseParser from parso.python.parser import Parser as PythonParser from parso.python.errors import ErrorFinderConfig from parso.python import pep8 -from parso.python import fstring _loaded_grammars = {} @@ -186,7 +185,6 @@ class Grammar(object): normalizer.walk(node) return normalizer.issues - def __repr__(self): labels = self._pgen_grammar.number2symbol.values() txt = ' '.join(list(labels)[:3]) + ' ...' @@ -215,34 +213,6 @@ class PythonGrammar(Grammar): return tokenize(code, self.version_info) -class PythonFStringGrammar(Grammar): - _token_namespace = fstring.TokenNamespace - _start_symbol = 'fstring' - - def __init__(self): - super(PythonFStringGrammar, self).__init__( - text=fstring.GRAMMAR, - tokenizer=fstring.tokenize, - parser=fstring.Parser - ) - - def parse(self, code, **kwargs): - return self._parse(code, **kwargs) - - def _parse(self, code, error_recovery=True, start_pos=(1, 0)): - tokens = self._tokenizer(code, start_pos=start_pos) - p = self._parser( - self._pgen_grammar, - error_recovery=error_recovery, - start_symbol=self._start_symbol, - ) - return p.parse(tokens=tokens) - - def parse_leaf(self, leaf, error_recovery=True): - code = leaf._get_payload() - return self.parse(code, error_recovery=True, start_pos=leaf.start_pos) - - def load_grammar(**kwargs): """ Loads a :py:class:`parso.Grammar`. The default version is the current Python @@ -273,10 +243,6 @@ def load_grammar(**kwargs): except FileNotFoundError: message = "Python version %s is currently not supported." % version raise NotImplementedError(message) - elif language == 'python-f-string': - if version is not None: - raise NotImplementedError("Currently different versions are not supported.") - return PythonFStringGrammar() else: raise NotImplementedError("No support for language %s." % language) diff --git a/parso/pgen2/parse.py b/parso/pgen2/parse.py index aaacfce..e2369d1 100644 --- a/parso/pgen2/parse.py +++ b/parso/pgen2/parse.py @@ -124,7 +124,9 @@ class PgenParser(object): self.error_recovery = error_recovery def parse(self, tokens): - for type_, value, start_pos, prefix in tokens: + for tok in tokens: + print(tok) + type_, value, start_pos, prefix = tok if self.add_token(type_, value, start_pos, prefix): break else: diff --git a/parso/pgen2/pgen.py b/parso/pgen2/pgen.py index 10ef6ff..a3e39fa 100644 --- a/parso/pgen2/pgen.py +++ b/parso/pgen2/pgen.py @@ -28,6 +28,7 @@ class ParserGenerator(object): c = grammar.Grammar(self._bnf_text) names = list(self.dfas.keys()) names.sort() + # TODO do we still need this? 
names.remove(self.startsymbol) names.insert(0, self.startsymbol) for name in names: @@ -316,8 +317,8 @@ class ParserGenerator(object): def _expect(self, type): if self.type != type: - self._raise_error("expected %s, got %s(%s)", - type, self.type, self.value) + self._raise_error("expected %s(%s), got %s(%s)", + type, token.tok_name[type], self.type, self.value) value = self.value self._gettoken() return value diff --git a/parso/python/errors.py b/parso/python/errors.py index 2920ed2..cfb8380 100644 --- a/parso/python/errors.py +++ b/parso/python/errors.py @@ -563,7 +563,8 @@ class _ReturnAndYieldChecks(SyntaxRule): and self._normalizer.version == (3, 5): self.add_issue(self.get_node(leaf), message=self.message_async_yield) -@ErrorFinder.register_rule(type='atom') + +@ErrorFinder.register_rule(type='strings') class _BytesAndStringMix(SyntaxRule): # e.g. 's' b'' message = "cannot mix bytes and nonbytes literals" @@ -842,101 +843,36 @@ class _TryStmtRule(SyntaxRule): self.add_issue(default_except, message=self.message) -@ErrorFinder.register_rule(type='string') +@ErrorFinder.register_rule(type='fstring') class _FStringRule(SyntaxRule): _fstring_grammar = None - message_empty = "f-string: empty expression not allowed" # f'{}' - message_single_closing = "f-string: single '}' is not allowed" # f'}' message_nested = "f-string: expressions nested too deeply" - message_backslash = "f-string expression part cannot include a backslash" # f'{"\"}' or f'{"\\"}' - message_comment = "f-string expression part cannot include '#'" # f'{#}' - message_unterminated_string = "f-string: unterminated string" # f'{"}' message_conversion = "f-string: invalid conversion character: expected 's', 'r', or 'a'" - message_incomplete = "f-string: expecting '}'" # f'{' - message_syntax = "invalid syntax" - @classmethod - def _load_grammar(cls): - import parso + def _check_format_spec(self, format_spec, depth): + self._check_fstring_contents(format_spec.children[1:], depth) - if cls._fstring_grammar is None: - cls._fstring_grammar = parso.load_grammar(language='python-f-string') - return cls._fstring_grammar + def _check_fstring_expr(self, fstring_expr, depth): + if depth >= 2: + self.add_issue(fstring_expr, message=self.message_nested) + + conversion = fstring_expr.children[2] + if conversion.type == 'fstring_conversion': + name = conversion.children[1] + if name.value not in ('s', 'r', 'a'): + self.add_issue(name, message=self.message_conversion) + + format_spec = fstring_expr.children[-2] + if format_spec.type == 'fstring_format_spec': + self._check_format_spec(format_spec, depth + 1) def is_issue(self, fstring): - if 'f' not in fstring.string_prefix.lower(): - return + self._check_fstring_contents(fstring.children[1:-1]) - parsed = self._load_grammar().parse_leaf(fstring) - for child in parsed.children: - if child.type == 'expression': - self._check_expression(child) - elif child.type == 'error_node': - next_ = child.get_next_leaf() - if next_.type == 'error_leaf' and next_.original_type == 'unterminated_string': - self.add_issue(next_, message=self.message_unterminated_string) - # At this point nothing more is comming except the error - # leaf that we've already checked here. 
- break - self.add_issue(child, message=self.message_incomplete) - elif child.type == 'error_leaf': - self.add_issue(child, message=self.message_single_closing) - - def _check_python_expr(self, python_expr): - value = python_expr.value - if '\\' in value: - self.add_issue(python_expr, message=self.message_backslash) - return - if '#' in value: - self.add_issue(python_expr, message=self.message_comment) - return - if re.match('\s*$', value) is not None: - self.add_issue(python_expr, message=self.message_empty) - return - - # This is now nested parsing. We parsed the fstring and now - # we're parsing Python again. - try: - # CPython has a bit of a special ways to parse Python code within - # f-strings. It wraps the code in brackets to make sure that - # whitespace doesn't make problems (indentation/newlines). - # Just use that algorithm as well here and adapt start positions. - start_pos = python_expr.start_pos - start_pos = start_pos[0], start_pos[1] - 1 - eval_input = self._normalizer.grammar._parse( - '(%s)' % value, - start_symbol='eval_input', - start_pos=start_pos, - error_recovery=False - ) - except ParserSyntaxError as e: - self.add_issue(e.error_leaf, message=self.message_syntax) - return - - issues = self._normalizer.grammar.iter_errors(eval_input) - self._normalizer.issues += issues - - def _check_format_spec(self, format_spec): - for expression in format_spec.children[1:]: - nested_format_spec = expression.children[-2] - if nested_format_spec.type == 'format_spec': - if len(nested_format_spec.children) > 1: - self.add_issue( - nested_format_spec.children[1], - message=self.message_nested - ) - - self._check_expression(expression) - - def _check_expression(self, expression): - for c in expression.children: - if c.type == 'python_expr': - self._check_python_expr(c) - elif c.type == 'conversion': - if c.value not in ('s', 'r', 'a'): - self.add_issue(c, message=self.message_conversion) - elif c.type == 'format_spec': - self._check_format_spec(c) + def _check_fstring_contents(self, children, depth=0): + for fstring_content in children: + if fstring_content.type == 'fstring_expr': + self._check_fstring_expr(fstring_content, depth) class _CheckAssignmentRule(SyntaxRule): @@ -949,7 +885,7 @@ class _CheckAssignmentRule(SyntaxRule): first, second = node.children[:2] error = _get_comprehension_type(node) if error is None: - if second.type in ('dictorsetmaker', 'string'): + if second.type == 'dictorsetmaker': error = 'literal' elif first in ('(', '['): if second.type == 'yield_expr': @@ -968,7 +904,7 @@ class _CheckAssignmentRule(SyntaxRule): error = 'Ellipsis' elif type_ == 'comparison': error = 'comparison' - elif type_ in ('string', 'number'): + elif type_ in ('string', 'number', 'strings'): error = 'literal' elif type_ == 'yield_expr': # This one seems to be a slightly different warning in Python. 
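With f-strings now handled by the regular Python grammar, the rewritten _FStringRule above walks ordinary fstring_expr nodes instead of re-parsing the string payload through a separate grammar. A minimal sketch of how these checks surface through the public API (variable names are illustrative; it assumes the load_grammar/iter_errors entry points shown in grammar.py above):

import parso

grammar = parso.load_grammar(version='3.6')

# Invalid conversion character: only 's', 'r' and 'a' are accepted.
module = grammar.parse("f'{1!b}'")
for issue in grammar.iter_errors(module):
    print(issue.start_pos, issue.message)

# Format specs nested two levels deep should trigger message_nested.
module = grammar.parse("f'{x:{y:{z}}}'")
for issue in grammar.iter_errors(module):
    print(issue.start_pos, issue.message)
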
diff --git a/parso/python/fstring.py b/parso/python/fstring.py deleted file mode 100644 index a8fe7b4..0000000 --- a/parso/python/fstring.py +++ /dev/null @@ -1,211 +0,0 @@ -import re - -from itertools import count -from parso.utils import PythonVersionInfo -from parso.utils import split_lines -from parso.python.tokenize import Token -from parso import parser -from parso.tree import TypedLeaf, ErrorNode, ErrorLeaf - -version36 = PythonVersionInfo(3, 6) - - -class TokenNamespace: - _c = count() - LBRACE = next(_c) - RBRACE = next(_c) - ENDMARKER = next(_c) - COLON = next(_c) - CONVERSION = next(_c) - PYTHON_EXPR = next(_c) - EXCLAMATION_MARK = next(_c) - UNTERMINATED_STRING = next(_c) - - token_map = dict((v, k) for k, v in locals().items() if not k.startswith('_')) - - @classmethod - def generate_token_id(cls, string): - if string == '{': - return cls.LBRACE - elif string == '}': - return cls.RBRACE - elif string == '!': - return cls.EXCLAMATION_MARK - elif string == ':': - return cls.COLON - return getattr(cls, string) - - -GRAMMAR = """ -fstring: expression* ENDMARKER -format_spec: ':' expression* -expression: '{' PYTHON_EXPR [ '!' CONVERSION ] [ format_spec ] '}' -""" - -_prefix = r'((?:[^{}]+)*)' -_expr = _prefix + r'(\{|\}|$)' -_in_expr = r'([^{}\[\]:"\'!]*)(.?)' -# There's only one conversion character allowed. But the rules have to be -# checked later anyway, so allow more here. This makes error recovery nicer. -_conversion = r'([^={}:]*)(.?)' - -_compiled_expr = re.compile(_expr) -_compiled_in_expr = re.compile(_in_expr) -_compiled_conversion = re.compile(_conversion) - - -def tokenize(code, start_pos=(1, 0)): - def add_to_pos(string): - lines = split_lines(string) - l = len(lines[-1]) - if len(lines) > 1: - start_pos[0] += len(lines) - 1 - start_pos[1] = l - else: - start_pos[1] += l - - def tok(value, type=None, prefix=''): - if type is None: - type = TokenNamespace.generate_token_id(value) - - add_to_pos(prefix) - token = Token(type, value, tuple(start_pos), prefix) - add_to_pos(value) - return token - - start = 0 - recursion_level = 0 - added_prefix = '' - start_pos = list(start_pos) - while True: - match = _compiled_expr.match(code, start) - prefix = added_prefix + match.group(1) - found = match.group(2) - start = match.end() - if not found: - # We're at the end. - break - - if found == '}': - if recursion_level == 0 and len(code) > start and code[start] == '}': - # This is a }} escape. - added_prefix = prefix + '}}' - start += 1 - continue - - recursion_level = max(0, recursion_level - 1) - yield tok(found, prefix=prefix) - added_prefix = '' - else: - assert found == '{' - if recursion_level == 0 and len(code) > start and code[start] == '{': - # This is a {{ escape. - added_prefix = prefix + '{{' - start += 1 - continue - - recursion_level += 1 - yield tok(found, prefix=prefix) - added_prefix = '' - - expression = '' - squared_count = 0 - curly_count = 0 - while True: - expr_match = _compiled_in_expr.match(code, start) - expression += expr_match.group(1) - found = expr_match.group(2) - start = expr_match.end() - - if found == '{': - curly_count += 1 - expression += found - elif found == '}' and curly_count > 0: - curly_count -= 1 - expression += found - elif found == '[': - squared_count += 1 - expression += found - elif found == ']': - # Use a max function here, because the Python code might - # just have syntax errors. 
- squared_count = max(0, squared_count - 1) - expression += found - elif found == ':' and (squared_count or curly_count): - expression += found - elif found in ('"', "'"): - search = found - if len(code) > start + 1 and \ - code[start] == found == code[start+1]: - search *= 3 - start += 2 - - index = code.find(search, start) - if index == -1: - yield tok(expression, type=TokenNamespace.PYTHON_EXPR) - yield tok( - found + code[start:], - type=TokenNamespace.UNTERMINATED_STRING, - ) - start = len(code) - break - expression += found + code[start:index+1] - start = index + 1 - elif found == '!' and len(code) > start and code[start] == '=': - # This is a python `!=` and not a conversion. - expression += found - else: - yield tok(expression, type=TokenNamespace.PYTHON_EXPR) - if found: - yield tok(found) - break - - if found == '!': - conversion_match = _compiled_conversion.match(code, start) - found = conversion_match.group(2) - start = conversion_match.end() - yield tok(conversion_match.group(1), type=TokenNamespace.CONVERSION) - if found: - yield tok(found) - if found == '}': - recursion_level -= 1 - - # We don't need to handle everything after ':', because that is - # basically new tokens. - - yield tok('', type=TokenNamespace.ENDMARKER, prefix=prefix) - - -class Parser(parser.BaseParser): - def parse(self, tokens): - node = super(Parser, self).parse(tokens) - if isinstance(node, self.default_leaf): # Is an endmarker. - # If there's no curly braces we get back a non-module. We always - # want an fstring. - node = self.default_node('fstring', [node]) - - return node - - def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos): - # TODO this is so ugly. - leaf_type = TokenNamespace.token_map[type].lower() - return TypedLeaf(leaf_type, value, start_pos, prefix) - - def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix, - add_token_callback): - if not self._error_recovery: - return super(Parser, self).error_recovery( - pgen_grammar, stack, arcs, typ, value, start_pos, prefix, - add_token_callback - ) - - token_type = TokenNamespace.token_map[typ].lower() - if len(stack) == 1: - error_leaf = ErrorLeaf(token_type, value, start_pos, prefix) - stack[0][2][1].append(error_leaf) - else: - dfa, state, (type_, nodes) = stack[1] - stack[0][2][1].append(ErrorNode(nodes)) - stack[1:] = [] - - add_token_callback(typ, value, start_pos, prefix) diff --git a/parso/python/grammar26.txt b/parso/python/grammar26.txt index b972a41..d9cede2 100644 --- a/parso/python/grammar26.txt +++ b/parso/python/grammar26.txt @@ -119,7 +119,8 @@ atom: ('(' [yield_expr|testlist_comp] ')' | '[' [listmaker] ']' | '{' [dictorsetmaker] '}' | '`' testlist1 '`' | - NAME | NUMBER | STRING+) + NAME | NUMBER | strings) +strings: STRING+ listmaker: test ( list_for | (',' test)* [','] ) # Dave: Renamed testlist_gexpr to testlist_comp, because in 2.7+ this is the # default. It's more consistent like this. 
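The new strings nonterminal groups implicitly concatenated string literals into a single node, which is what the relocated _BytesAndStringMix rule (now registered on type='strings') and the assignment checks above operate on. A rough illustration, assuming the top-level parso.parse/load_grammar helpers and the usual tree layout:

import parso

module = parso.parse('"head" "tail"\n', version='3.4')
node = module.children[0].children[0]
print(node.type)                                # expected: 'strings'
print([child.type for child in node.children])  # expected: two 'string' leaves

# Mixing bytes and str in one concatenation is what the relocated rule reports:
grammar = parso.load_grammar(version='3.4')
errors = grammar.iter_errors(grammar.parse('b"x" "y"\n'))
print([issue.message for issue in errors])
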
diff --git a/parso/python/grammar27.txt b/parso/python/grammar27.txt index 4c3f33d..359f12b 100644 --- a/parso/python/grammar27.txt +++ b/parso/python/grammar27.txt @@ -104,7 +104,8 @@ atom: ('(' [yield_expr|testlist_comp] ')' | '[' [listmaker] ']' | '{' [dictorsetmaker] '}' | '`' testlist1 '`' | - NAME | NUMBER | STRING+) + NAME | NUMBER | strings) +strings: STRING+ listmaker: test ( list_for | (',' test)* [','] ) testlist_comp: test ( comp_for | (',' test)* [','] ) lambdef: 'lambda' [varargslist] ':' test diff --git a/parso/python/grammar33.txt b/parso/python/grammar33.txt index d7aaffd..3a55809 100644 --- a/parso/python/grammar33.txt +++ b/parso/python/grammar33.txt @@ -103,7 +103,8 @@ power: atom trailer* ['**' factor] atom: ('(' [yield_expr|testlist_comp] ')' | '[' [testlist_comp] ']' | '{' [dictorsetmaker] '}' | - NAME | NUMBER | STRING+ | '...' | 'None' | 'True' | 'False') + NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False') +strings: STRING+ testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] ) trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME subscriptlist: subscript (',' subscript)* [','] diff --git a/parso/python/grammar34.txt b/parso/python/grammar34.txt index 05c3181..324bba1 100644 --- a/parso/python/grammar34.txt +++ b/parso/python/grammar34.txt @@ -103,7 +103,8 @@ power: atom trailer* ['**' factor] atom: ('(' [yield_expr|testlist_comp] ')' | '[' [testlist_comp] ']' | '{' [dictorsetmaker] '}' | - NAME | NUMBER | STRING+ | '...' | 'None' | 'True' | 'False') + NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False') +strings: STRING+ testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] ) trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME subscriptlist: subscript (',' subscript)* [','] diff --git a/parso/python/grammar35.txt b/parso/python/grammar35.txt index c38217f..5868b8f 100644 --- a/parso/python/grammar35.txt +++ b/parso/python/grammar35.txt @@ -110,7 +110,8 @@ atom_expr: ['await'] atom trailer* atom: ('(' [yield_expr|testlist_comp] ')' | '[' [testlist_comp] ']' | '{' [dictorsetmaker] '}' | - NAME | NUMBER | STRING+ | '...' | 'None' | 'True' | 'False') + NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False') +strings: STRING+ testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] ) trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME subscriptlist: subscript (',' subscript)* [','] diff --git a/parso/python/grammar36.txt b/parso/python/grammar36.txt index e76147e..b82c1fe 100644 --- a/parso/python/grammar36.txt +++ b/parso/python/grammar36.txt @@ -108,7 +108,7 @@ atom_expr: ['await'] atom trailer* atom: ('(' [yield_expr|testlist_comp] ')' | '[' [testlist_comp] ']' | '{' [dictorsetmaker] '}' | - NAME | NUMBER | STRING+ | '...' | 'None' | 'True' | 'False') + NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False') testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] ) trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME subscriptlist: subscript (',' subscript)* [','] @@ -148,3 +148,10 @@ encoding_decl: NAME yield_expr: 'yield' [yield_arg] yield_arg: 'from' test | testlist + +strings: (STRING | fstring)+ +fstring: FSTRING_START fstring_content* FSTRING_END +fstring_content: FSTRING_STRING | fstring_expr +fstring_conversion: '!' 
NAME +fstring_expr: '{' testlist_comp [ fstring_conversion ] [ fstring_format_spec ] '}' +fstring_format_spec: ':' fstring_content* diff --git a/parso/python/grammar37.txt b/parso/python/grammar37.txt index e76147e..7d112f7 100644 --- a/parso/python/grammar37.txt +++ b/parso/python/grammar37.txt @@ -108,7 +108,7 @@ atom_expr: ['await'] atom trailer* atom: ('(' [yield_expr|testlist_comp] ')' | '[' [testlist_comp] ']' | '{' [dictorsetmaker] '}' | - NAME | NUMBER | STRING+ | '...' | 'None' | 'True' | 'False') + NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False') testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] ) trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME subscriptlist: subscript (',' subscript)* [','] @@ -148,3 +148,10 @@ encoding_decl: NAME yield_expr: 'yield' [yield_arg] yield_arg: 'from' test | testlist + +strings: (STRING | fstring)+ +fstring: FSTRING_START fstring_content* FSTRING_END +fstring_content: FSTRING_STRING | fstring_expr +fstring_conversion: '!' NAME +fstring_expr: '{' testlist [ fstring_conversion ] [ fstring_format_spec ] '}' +fstring_format_spec: ':' fstring_content* diff --git a/parso/python/parser.py b/parso/python/parser.py index 1897f53..4eb9241 100644 --- a/parso/python/parser.py +++ b/parso/python/parser.py @@ -1,6 +1,6 @@ from parso.python import tree from parso.python.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER, - STRING, tok_name, NAME) + STRING, tok_name, NAME, FSTRING_STRING) from parso.parser import BaseParser from parso.pgen2.parse import token_to_ilabel @@ -129,6 +129,8 @@ class Parser(BaseParser): return tree.Newline(value, start_pos, prefix) elif type == ENDMARKER: return tree.EndMarker(value, start_pos, prefix) + elif type == FSTRING_STRING: + return tree.FStringString(value, start_pos, prefix) else: return tree.Operator(value, start_pos, prefix) diff --git a/parso/python/token.py b/parso/python/token.py index fb590a5..dd849b0 100644 --- a/parso/python/token.py +++ b/parso/python/token.py @@ -32,6 +32,14 @@ if py_version < 35: ERROR_DEDENT = next(_counter) tok_name[ERROR_DEDENT] = 'ERROR_DEDENT' +FSTRING_START = next(_counter) +tok_name[FSTRING_START] = 'FSTRING_START' +FSTRING_END = next(_counter) +tok_name[FSTRING_END] = 'FSTRING_END' +FSTRING_STRING = next(_counter) +tok_name[FSTRING_STRING] = 'FSTRING_STRING' +EXCLAMATION = next(_counter) +tok_name[EXCLAMATION] = 'EXCLAMATION' # Map from operator to number (since tokenize doesn't do this) @@ -84,6 +92,7 @@ opmap_raw = """\ //= DOUBLESLASHEQUAL -> RARROW ... ELLIPSIS +! EXCLAMATION """ opmap = {} diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py index ecd2437..7d72fc6 100644 --- a/parso/python/tokenize.py +++ b/parso/python/tokenize.py @@ -20,14 +20,15 @@ from codecs import BOM_UTF8 from parso.python.token import (tok_name, ENDMARKER, STRING, NUMBER, opmap, NAME, ERRORTOKEN, NEWLINE, INDENT, DEDENT, - ERROR_DEDENT) + ERROR_DEDENT, FSTRING_STRING, FSTRING_START, + FSTRING_END) from parso._compatibility import py_version from parso.utils import split_lines TokenCollection = namedtuple( 'TokenCollection', - 'pseudo_token single_quoted triple_quoted endpats always_break_tokens', + 'pseudo_token single_quoted triple_quoted endpats fstring_pattern_map always_break_tokens', ) BOM_UTF8_STRING = BOM_UTF8.decode('utf-8') @@ -52,32 +53,35 @@ def group(*choices, **kwargs): return start + '|'.join(choices) + ')' -def any(*choices): - return group(*choices) + '*' - - def maybe(*choices): return group(*choices) + '?' 
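With the grammar rules added above for 3.6 and 3.7, an f-string becomes a regular branch of the parse tree: FSTRING_START and FSTRING_END delimit it, literal text becomes FSTRING_STRING leaves, and every replacement field is an fstring_expr that may carry an fstring_conversion and an fstring_format_spec. A sketch of inspecting that shape (illustrative only; node access follows the usual parso tree API):

import parso

module = parso.parse('f"a{x!r:>10}b"\n', version='3.6')
fstring = module.children[0].children[0]
print(fstring.type)                      # expected: 'fstring'
for child in fstring.children:
    print(child.type, repr(child.get_code()))
# The middle child should be an 'fstring_expr' node whose children cover
# '{', the expression, the optional conversion/format spec, and '}'.
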
# Return the empty string, plus all of the valid string prefixes. -def _all_string_prefixes(version_info): +def _all_string_prefixes(version_info, include_fstring=False, only_fstring=False): def different_case_versions(prefix): for s in _itertools.product(*[(c, c.upper()) for c in prefix]): yield ''.join(s) # The valid string prefixes. Only contain the lower case versions, # and don't contain any permuations (include 'fr', but not # 'rf'). The various permutations will be generated. - _valid_string_prefixes = ['b', 'r', 'u'] + valid_string_prefixes = ['b', 'r', 'u'] if version_info >= (3, 0): - _valid_string_prefixes.append('br') + valid_string_prefixes.append('br') - if version_info >= (3, 6): - _valid_string_prefixes += ['f', 'fr'] + result = set(['']) + if version_info >= (3, 6) and include_fstring: + f = ['f', 'fr'] + if only_fstring: + valid_string_prefixes = f + result = set() + else: + valid_string_prefixes += f + elif only_fstring: + return set() # if we add binary f-strings, add: ['fb', 'fbr'] - result = set(['']) - for prefix in _valid_string_prefixes: + for prefix in valid_string_prefixes: for t in _itertools.permutations(prefix): # create a list with upper and lower versions of each # character @@ -102,6 +106,10 @@ def _get_token_collection(version_info): return result +fstring_string_single_line = _compile(r'(?:[^{}\r\n]+|\{\{|\}\})+') +fstring_string_multi_line = _compile(r'(?:[^{}]+|\{\{|\}\})+') + + def _create_token_collection(version_info): # Note: we use unicode matching for names ("\w") but ascii matching for # number literals. @@ -141,6 +149,9 @@ def _create_token_collection(version_info): # StringPrefix can be the empty string (making it optional). possible_prefixes = _all_string_prefixes(version_info) StringPrefix = group(*possible_prefixes) + StringPrefixWithF = group(*_all_string_prefixes(version_info, include_fstring=True)) + fstring_prefixes = _all_string_prefixes(version_info, include_fstring=True, only_fstring=True) + FStringStart = group(*fstring_prefixes) # Tail end of ' string. Single = r"[^'\\]*(?:\\.[^'\\]*)*'" @@ -150,14 +161,14 @@ def _create_token_collection(version_info): Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" # Tail end of """ string. Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' - Triple = group(StringPrefix + "'''", StringPrefix + '"""') + Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""') # Because of leftmost-then-longest match semantics, be sure to put the # longest operators first (e.g., if = came before ==, == would get # recognized as two instances of =). - Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=", + Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"//=?", r"->", - r"[+\-*/%&@`|^=<>]=?", + r"[+\-*/%&@`|^!=<>]=?", r"~") Bracket = '[][(){}]' @@ -174,7 +185,12 @@ def _create_token_collection(version_info): group("'", r'\\\r?\n'), StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r'\\\r?\n')) - PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple) + pseudo_extra_pool = [Comment, Triple] + all_quotes = '"', "'", '"""', "'''" + if fstring_prefixes: + pseudo_extra_pool.append(FStringStart + group(*all_quotes)) + + PseudoExtras = group(r'\\\r?\n|\Z', *pseudo_extra_pool) PseudoToken = group(Whitespace, capture=True) + \ group(PseudoExtras, Number, Funny, ContStr, Name, capture=True) @@ -192,18 +208,24 @@ def _create_token_collection(version_info): # including the opening quotes. 
single_quoted = set() triple_quoted = set() + fstring_pattern_map = {} for t in possible_prefixes: - for p in (t + '"', t + "'"): - single_quoted.add(p) - for p in (t + '"""', t + "'''"): - triple_quoted.add(p) + for quote in '"', "'": + single_quoted.add(t + quote) + + for quote in '"""', "'''": + triple_quoted.add(t + quote) + + for t in fstring_prefixes: + for quote in all_quotes: + fstring_pattern_map[t + quote] = quote ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except', 'finally', 'while', 'with', 'return') pseudo_token_compiled = _compile(PseudoToken) return TokenCollection( pseudo_token_compiled, single_quoted, triple_quoted, endpats, - ALWAYS_BREAK_TOKENS + fstring_pattern_map, ALWAYS_BREAK_TOKENS ) @@ -226,6 +248,86 @@ class PythonToken(Token): self._replace(type=self._get_type_name())) +class FStringNode(object): + def __init__(self, quote): + self.quote = quote + self.parentheses_count = 0 + self.previous_lines = '' + self.last_string_start_pos = None + # In the syntax there can be multiple format_spec's nested: + # {x:{y:3}} + self.format_spec_count = 0 + + def open_parentheses(self, character): + self.parentheses_count += 1 + + def close_parentheses(self, character): + self.parentheses_count -= 1 + + def allow_multiline(self): + return len(self.quote) == 3 + + def is_in_expr(self): + return (self.parentheses_count - self.format_spec_count) > 0 + + +def _check_fstring_ending(fstring_stack, token, from_start=False): + fstring_end = float('inf') + fstring_index = None + for i, node in enumerate(fstring_stack): + if from_start: + if token.startswith(node.quote): + fstring_index = i + fstring_end = len(node.quote) + else: + continue + else: + try: + end = token.index(node.quote) + except ValueError: + pass + else: + if fstring_index is None or end < fstring_end: + fstring_index = i + fstring_end = end + return fstring_index, fstring_end + + +def _find_fstring_string(fstring_stack, line, lnum, pos): + tos = fstring_stack[-1] + if tos.is_in_expr(): + return '', pos + else: + new_pos = pos + allow_multiline = tos.allow_multiline() + if allow_multiline: + match = fstring_string_multi_line.match(line, pos) + else: + match = fstring_string_single_line.match(line, pos) + if match is None: + string = tos.previous_lines + else: + print(match, lnum, pos, repr(tos.previous_lines)) + if not tos.previous_lines: + tos.last_string_start_pos = (lnum, pos) + + string = match.group(0) + for fstring_stack_node in fstring_stack: + try: + string = string[:string.index(fstring_stack_node.quote)] + except ValueError: + pass # The string was not found. + + new_pos += len(string) + if allow_multiline and string.endswith('\n'): + tos.previous_lines += string + string = '' + else: + string = tos.previous_lines + string + + return string, new_pos + + def tokenize(code, version_info, start_pos=(1, 0)): """Generate tokens from a the source code (string).""" lines = split_lines(code, keepends=True) @@ -240,7 +342,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): token. This idea comes from lib2to3. The prefix contains all information that is irrelevant for the parser like newlines in parentheses or comments. 
""" - pseudo_token, single_quoted, triple_quoted, endpats, always_break_tokens, = \ + pseudo_token, single_quoted, triple_quoted, endpats, fstring_pattern_map, always_break_tokens, = \ _get_token_collection(version_info) paren_level = 0 # count parentheses indents = [0] @@ -257,6 +359,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): additional_prefix = '' first = True lnum = start_pos[0] - 1 + fstring_stack = [] for line in lines: # loop over lines in stream lnum += 1 pos = 0 @@ -287,6 +390,33 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): continue while pos < max: + if fstring_stack: + string, pos = _find_fstring_string(fstring_stack, line, lnum, pos) + if string: + yield PythonToken( + FSTRING_STRING, string, + fstring_stack[-1].last_string_start_pos, '' + ) + fstring_stack[-1].previous_lines = '' + continue + + if pos == max: + break + + rest = line[pos:] + fstring_index, end = _check_fstring_ending(fstring_stack, rest, from_start=True) + + if fstring_index is not None: + yield PythonToken( + FSTRING_END, + fstring_stack[fstring_index].quote, + (lnum, pos), + prefix='' + ) + del fstring_stack[fstring_index:] + pos += end + continue + pseudomatch = pseudo_token.match(line, pos) if not pseudomatch: # scan for tokens txt = line[pos:] @@ -311,10 +441,11 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): if new_line and initial not in '\r\n#': new_line = False - if paren_level == 0: + if paren_level == 0 and not fstring_stack: i = 0 while line[i] == '\f': i += 1 + # TODO don't we need to change spos as well? start -= 1 if start > indents[-1]: yield PythonToken(INDENT, '', spos, '') @@ -326,11 +457,30 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): yield PythonToken(DEDENT, '', spos, '') indents.pop() + if fstring_stack: + fstring_index, end = _check_fstring_ending(fstring_stack, token) + if fstring_index is not None: + if end != 0: + yield PythonToken(ERRORTOKEN, token[:end], spos, prefix) + + yield PythonToken( + FSTRING_END, + fstring_stack[fstring_index].quote, + (lnum, spos[1] + 1), + prefix='' + ) + del fstring_stack[fstring_index:] + pos -= len(token) - end + continue + if (initial in numchars or # ordinary number (initial == '.' and token != '.' and token != '...')): yield PythonToken(NUMBER, token, spos, prefix) elif initial in '\r\n': - if not new_line and paren_level == 0: + if any(not f.allow_multiline() for f in fstring_stack): + fstring_stack.clear() + + if not new_line and paren_level == 0 and not fstring_stack: yield PythonToken(NEWLINE, token, spos, prefix) else: additional_prefix = prefix + token @@ -362,8 +512,12 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): break else: # ordinary string yield PythonToken(STRING, token, spos, prefix) + elif token in fstring_pattern_map: # The start of an fstring. 
+ fstring_stack.append(FStringNode(fstring_pattern_map[token])) + yield PythonToken(FSTRING_START, token, spos, prefix) elif is_identifier(initial): # ordinary name if token in always_break_tokens: + fstring_stack.clear() paren_level = 0 while True: indent = indents.pop() @@ -378,9 +532,18 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): break else: if token in '([{': - paren_level += 1 + if fstring_stack: + fstring_stack[-1].open_parentheses(token) + else: + paren_level += 1 elif token in ')]}': - paren_level -= 1 + if fstring_stack: + fstring_stack[-1].close_parentheses(token) + else: + paren_level -= 1 + elif token == ':' and fstring_stack \ + and fstring_stack[-1].parentheses_count == 1: + fstring_stack[-1].format_spec_count += 1 try: # This check is needed in any case to check if it's a valid diff --git a/parso/python/tree.py b/parso/python/tree.py index eb97780..075505f 100644 --- a/parso/python/tree.py +++ b/parso/python/tree.py @@ -262,6 +262,15 @@ class String(Literal): return match.group(2)[:-len(match.group(1))] +class FStringString(Leaf): + """ + f-strings contain f-string expressions and normal python strings. These are + the string parts of f-strings. + """ + type = 'fstring_string' + __slots__ = () + + class _StringComparisonMixin(object): def __eq__(self, other): """ diff --git a/test/failing_examples.py b/test/failing_examples.py index 4227679..25e93ca 100644 --- a/test/failing_examples.py +++ b/test/failing_examples.py @@ -141,7 +141,7 @@ FAILING_EXAMPLES = [ # f-strings 'f"{}"', - 'f"{\\}"', + r'f"{\}"', 'f"{\'\\\'}"', 'f"{#}"', "f'{1!b}'", diff --git a/test/test_fstring.py b/test/test_fstring.py index 936d7fb..59debd9 100644 --- a/test/test_fstring.py +++ b/test/test_fstring.py @@ -1,17 +1,18 @@ import pytest from parso import load_grammar, ParserSyntaxError -from parso.python.fstring import tokenize +from parso.python.tokenize import tokenize @pytest.fixture def grammar(): - return load_grammar(language="python-f-string") + return load_grammar(version='3.6') @pytest.mark.parametrize( 'code', [ '{1}', + '{1:}', '', '{1!a}', '{1!a:1}', @@ -26,22 +27,12 @@ def grammar(): '{{{1}', '1{{2{{3', '}}', - '{:}}}', - - # Invalid, but will be checked, later. - '{}', - '{1:}', - '{:}', - '{:1}', - '{!:}', - '{!}', - '{!a}', - '{1:{}}', - '{1:{:}}', ] ) def test_valid(code, grammar): - fstring = grammar.parse(code, error_recovery=False) + code = 'f"""%s"""' % code + module = grammar.parse(code, error_recovery=False) + fstring = module.children[0] assert fstring.type == 'fstring' assert fstring.get_code() == code @@ -52,24 +43,37 @@ def test_valid(code, grammar): '{', '{1!{a}}', '{!{a}}', + '{}', + '{:}', + '{:}}}', + '{:1}', + '{!:}', + '{!}', + '{!a}', + '{1:{}}', + '{1:{:}}', ] ) def test_invalid(code, grammar): + code = 'f"""%s"""' % code with pytest.raises(ParserSyntaxError): grammar.parse(code, error_recovery=False) # It should work with error recovery. - #grammar.parse(code, error_recovery=True) + grammar.parse(code, error_recovery=True) @pytest.mark.parametrize( - ('code', 'start_pos', 'positions'), [ + ('code', 'positions'), [ # 2 times 2, 5 because python expr and endmarker. 
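After the tokenizer changes above, an f-string no longer reaches the parser as one STRING token: the prefix and opening quote arrive as FSTRING_START, literal parts as FSTRING_STRING, the closing quote as FSTRING_END, and the expression parts as ordinary tokens in between. A quick way to see the stream (a sketch; it unpacks tokens the same way parse.py does above):

from parso.python.tokenize import tokenize
from parso.python.token import tok_name

for typ, value, start_pos, prefix in tokenize('f"a{x!r}b"', version_info=(3, 6)):
    print(tok_name[typ], repr(value), start_pos)
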
- ('}{', (2, 3), [(2, 3), (2, 4), (2, 5), (2, 5)]), - (' :{ 1 : } ', (1, 0), [(1, 2), (1, 3), (1, 6), (1, 8), (1, 10)]), - ('\n{\nfoo\n }', (2, 1), [(3, 0), (3, 1), (5, 1), (5, 2)]), + ('f"}{"', [(1, 0), (1, 2), (1, 3), (1, 4), (1, 5)]), + ('f" :{ 1 : } "', [(1, 0), (1, 2), (1, 4), (1, 6), (1, 8), (1, 9), + (1, 10), (1, 11), (1, 12), (1, 13)]), + ('f"""\n {\nfoo\n }"""', [(1, 0), (1, 4), (2, 1), (3, 0), (4, 1), + (4, 2), (4, 5)]), ] ) -def test_tokenize_start_pos(code, start_pos, positions): - tokens = tokenize(code, start_pos) +def test_tokenize_start_pos(code, positions): + tokens = list(tokenize(code, version_info=(3, 6))) + print(tokens) assert positions == [p.start_pos for p in tokens] diff --git a/test/test_python_errors.py b/test/test_python_errors.py index b724a41..67f3e1d 100644 --- a/test/test_python_errors.py +++ b/test/test_python_errors.py @@ -114,6 +114,22 @@ def _get_actual_exception(code): # Python 3.4/3.4 have a bit of a different warning than 3.5/3.6 in # certain places. But in others this error makes sense. return [wanted, "SyntaxError: can't use starred expression here"], line_nr + elif wanted == 'SyntaxError: f-string: unterminated string': + wanted = 'SyntaxError: EOL while scanning string literal' + elif wanted == 'SyntaxError: f-string expression part cannot include a backslash': + return [ + wanted, + "SyntaxError: EOL while scanning string literal", + "SyntaxError: unexpected character after line continuation character", + ], line_nr + elif wanted == "SyntaxError: f-string: expecting '}'": + wanted = 'SyntaxError: EOL while scanning string literal' + elif wanted == 'SyntaxError: f-string: empty expression not allowed': + wanted = 'SyntaxError: invalid syntax' + elif wanted == "SyntaxError: f-string expression part cannot include '#'": + wanted = 'SyntaxError: invalid syntax' + elif wanted == "SyntaxError: f-string: single '}' is not allowed": + wanted = 'SyntaxError: invalid syntax' return [wanted], line_nr diff --git a/test/test_tokenize.py b/test/test_tokenize.py index bd62d95..2951380 100644 --- a/test/test_tokenize.py +++ b/test/test_tokenize.py @@ -7,7 +7,8 @@ import pytest from parso._compatibility import py_version from parso.utils import split_lines, parse_version_string from parso.python.token import ( - NAME, NEWLINE, STRING, INDENT, DEDENT, ERRORTOKEN, ENDMARKER, ERROR_DEDENT) + NAME, NEWLINE, STRING, INDENT, DEDENT, ERRORTOKEN, ENDMARKER, ERROR_DEDENT, + FSTRING_START) from parso.python import tokenize from parso import parse from parso.python.tokenize import PythonToken @@ -162,8 +163,9 @@ def test_ur_literals(): token_list = _get_token_list(literal) typ, result_literal, _, _ = token_list[0] if is_literal: - assert typ == STRING - assert result_literal == literal + if typ != FSTRING_START: + assert typ == STRING + assert result_literal == literal else: assert typ == NAME @@ -175,6 +177,7 @@ def test_ur_literals(): # Starting with Python 3.3 this ordering is also possible. if py_version >= 33: check('Rb""') + # Starting with Python 3.6 format strings where introduced. check('fr""', is_literal=py_version >= 36) check('rF""', is_literal=py_version >= 36) diff --git a/tox.ini b/tox.ini index 4b6d3b2..344aacc 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py26, py27, py33, py34, py35, py36 +envlist = py26, py27, py33, py34, py35, py36, py37 [testenv] deps = {env:_PARSO_TEST_PYTEST_DEP:pytest>=3.0.7}
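
The rewritten tests above drive f-strings through the normal 3.6 grammar instead of the removed "python-f-string" language. An end-to-end sketch of that usage, mirroring the fixtures in test_fstring.py (the snippets and assertions follow the tests; only the variable names are mine):

import parso
from parso import ParserSyntaxError

grammar = parso.load_grammar(version='3.6')

# A valid f-string parses without error recovery and round-trips its code.
code = 'f"""{1!a:1}"""'
module = grammar.parse(code, error_recovery=False)
fstring = module.children[0]
assert fstring.type == 'fstring'
assert fstring.get_code() == code

# An unterminated expression is now a plain parse error without recovery ...
try:
    grammar.parse('f"""{"""', error_recovery=False)
except ParserSyntaxError:
    pass

# ... while error recovery still yields a usable tree.
module = grammar.parse('f"""{"""', error_recovery=True)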