From 9f88fe16a374cf3295578792cd0f3c8334b40eb5 Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Wed, 28 Mar 2018 02:03:18 +0200 Subject: [PATCH 01/16] Added the fstring grammar without the tokenization part This means that fstrings are not yet parsed, because there are no f-string tokens. --- parso/python/errors.py | 7 ++++--- parso/python/grammar36.txt | 8 +++++++- parso/python/token.py | 8 ++++++++ parso/python/tokenize.py | 7 +++++++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/parso/python/errors.py b/parso/python/errors.py index 2920ed2..fe1e273 100644 --- a/parso/python/errors.py +++ b/parso/python/errors.py @@ -563,7 +563,8 @@ class _ReturnAndYieldChecks(SyntaxRule): and self._normalizer.version == (3, 5): self.add_issue(self.get_node(leaf), message=self.message_async_yield) -@ErrorFinder.register_rule(type='atom') + +@ErrorFinder.register_rule(type='strings') class _BytesAndStringMix(SyntaxRule): # e.g. 's' b'' message = "cannot mix bytes and nonbytes literals" @@ -949,7 +950,7 @@ class _CheckAssignmentRule(SyntaxRule): first, second = node.children[:2] error = _get_comprehension_type(node) if error is None: - if second.type in ('dictorsetmaker', 'string'): + if second.type == 'dictorsetmaker': error = 'literal' elif first in ('(', '['): if second.type == 'yield_expr': @@ -968,7 +969,7 @@ class _CheckAssignmentRule(SyntaxRule): error = 'Ellipsis' elif type_ == 'comparison': error = 'comparison' - elif type_ in ('string', 'number'): + elif type_ in ('string', 'number', 'strings'): error = 'literal' elif type_ == 'yield_expr': # This one seems to be a slightly different warning in Python. diff --git a/parso/python/grammar36.txt b/parso/python/grammar36.txt index e76147e..5b434b8 100644 --- a/parso/python/grammar36.txt +++ b/parso/python/grammar36.txt @@ -108,7 +108,7 @@ atom_expr: ['await'] atom trailer* atom: ('(' [yield_expr|testlist_comp] ')' | '[' [testlist_comp] ']' | '{' [dictorsetmaker] '}' | - NAME | NUMBER | STRING+ | '...' | 'None' | 'True' | 'False') + NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False') testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] ) trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' 
NAME subscriptlist: subscript (',' subscript)* [','] @@ -148,3 +148,9 @@ encoding_decl: NAME yield_expr: 'yield' [yield_arg] yield_arg: 'from' test | testlist + +strings: (STRING | fstring)+ +fstring: FSTRING_START fstring_content FSTRING_END +fstring_content: (FSTRING_STRING | fstring_expr)* +fstring_expr: '{' testlist [ FSTRING_CONVERSION ] [ fstring_format_spec ] '}' +fstring_format_spec: ':' fstring_content diff --git a/parso/python/token.py b/parso/python/token.py index fb590a5..528346b 100644 --- a/parso/python/token.py +++ b/parso/python/token.py @@ -32,6 +32,14 @@ if py_version < 35: ERROR_DEDENT = next(_counter) tok_name[ERROR_DEDENT] = 'ERROR_DEDENT' +FSTRING_START = next(_counter) +tok_name[FSTRING_START] = 'FSTRING_START' +FSTRING_END = next(_counter) +tok_name[FSTRING_END] = 'FSTRING_END' +FSTRING_STRING = next(_counter) +tok_name[FSTRING_STRING] = 'FSTRING_STRING' +FSTRING_CONVERSION = next(_counter) +tok_name[FSTRING_CONVERSION] = 'FSTRING_CONVERSION' # Map from operator to number (since tokenize doesn't do this) diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py index ecd2437..8537884 100644 --- a/parso/python/tokenize.py +++ b/parso/python/tokenize.py @@ -25,6 +25,13 @@ from parso._compatibility import py_version from parso.utils import split_lines +#fstring_start = /[f|fr|rf]["|"""|'|''']/ +#fstring_end = +fstring_expr_start = '' +fstring_string = r'([^{}\n]+|\{\{|\}\})*' +fstring_conversion = r'![sra]' + + TokenCollection = namedtuple( 'TokenCollection', 'pseudo_token single_quoted triple_quoted endpats always_break_tokens', From 25e4ea9c24add0579aff0aeed6474d58299f1919 Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Wed, 28 Mar 2018 02:16:37 +0200 Subject: [PATCH 02/16] A small improvement in checks --- parso/python/tokenize.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py index 8537884..2820669 100644 --- a/parso/python/tokenize.py +++ b/parso/python/tokenize.py @@ -59,10 +59,6 @@ def group(*choices, **kwargs): return start + '|'.join(choices) + ')' -def any(*choices): - return group(*choices) + '*' - - def maybe(*choices): return group(*choices) + '?' @@ -357,9 +353,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): contstr = line[start:] contline = line break - elif initial in single_quoted or \ - token[:2] in single_quoted or \ - token[:3] in single_quoted: + elif any(token.startswith(s) for s in single_quoted): if token[-1] == '\n': # continued string contstr_start = lnum, start endprog = (endpats.get(initial) or endpats.get(token[1]) From e05ce5ae3124985e1d22797c7cb751971d6b67cd Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Wed, 28 Mar 2018 09:51:37 +0200 Subject: [PATCH 03/16] Revert "A small improvement in checks" The problem with this commit is that it probably makes some checks slower. It's still slightly more beautiful, but we leave it for now. This reverts commit 25e4ea9c24add0579aff0aeed6474d58299f1919. --- parso/python/tokenize.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py index 2820669..8537884 100644 --- a/parso/python/tokenize.py +++ b/parso/python/tokenize.py @@ -59,6 +59,10 @@ def group(*choices, **kwargs): return start + '|'.join(choices) + ')' +def any(*choices): + return group(*choices) + '*' + + def maybe(*choices): return group(*choices) + '?' 
@@ -353,7 +357,9 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): contstr = line[start:] contline = line break - elif any(token.startswith(s) for s in single_quoted): + elif initial in single_quoted or \ + token[:2] in single_quoted or \ + token[:3] in single_quoted: if token[-1] == '\n': # continued string contstr_start = lnum, start endprog = (endpats.get(initial) or endpats.get(token[1]) From d8d2e596a58a8a4058b2b30af7954f2a9c61a72e Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Fri, 30 Mar 2018 20:50:49 +0200 Subject: [PATCH 04/16] A first implementation of the fstring tokenizer --- parso/python/grammar36.txt | 3 +- parso/python/token.py | 2 - parso/python/tokenize.py | 165 +++++++++++++++++++++++++++++++------ 3 files changed, 144 insertions(+), 26 deletions(-) diff --git a/parso/python/grammar36.txt b/parso/python/grammar36.txt index 5b434b8..60076eb 100644 --- a/parso/python/grammar36.txt +++ b/parso/python/grammar36.txt @@ -152,5 +152,6 @@ yield_arg: 'from' test | testlist strings: (STRING | fstring)+ fstring: FSTRING_START fstring_content FSTRING_END fstring_content: (FSTRING_STRING | fstring_expr)* -fstring_expr: '{' testlist [ FSTRING_CONVERSION ] [ fstring_format_spec ] '}' +fstring_conversion: '!' NAME +fstring_expr: '{' testlist [ fstring_conversion ] [ fstring_format_spec ] '}' fstring_format_spec: ':' fstring_content diff --git a/parso/python/token.py b/parso/python/token.py index 528346b..42f1a89 100644 --- a/parso/python/token.py +++ b/parso/python/token.py @@ -38,8 +38,6 @@ FSTRING_END = next(_counter) tok_name[FSTRING_END] = 'FSTRING_END' FSTRING_STRING = next(_counter) tok_name[FSTRING_STRING] = 'FSTRING_STRING' -FSTRING_CONVERSION = next(_counter) -tok_name[FSTRING_CONVERSION] = 'FSTRING_CONVERSION' # Map from operator to number (since tokenize doesn't do this) diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py index 8537884..b13f847 100644 --- a/parso/python/tokenize.py +++ b/parso/python/tokenize.py @@ -20,21 +20,15 @@ from codecs import BOM_UTF8 from parso.python.token import (tok_name, ENDMARKER, STRING, NUMBER, opmap, NAME, ERRORTOKEN, NEWLINE, INDENT, DEDENT, - ERROR_DEDENT) + ERROR_DEDENT, FSTRING_STRING, FSTRING_START, + FSTRING_END) from parso._compatibility import py_version from parso.utils import split_lines -#fstring_start = /[f|fr|rf]["|"""|'|''']/ -#fstring_end = -fstring_expr_start = '' -fstring_string = r'([^{}\n]+|\{\{|\}\})*' -fstring_conversion = r'![sra]' - - TokenCollection = namedtuple( 'TokenCollection', - 'pseudo_token single_quoted triple_quoted endpats always_break_tokens', + 'pseudo_token single_quoted triple_quoted endpats fstring_endpats always_break_tokens', ) BOM_UTF8_STRING = BOM_UTF8.decode('utf-8') @@ -68,7 +62,7 @@ def maybe(*choices): # Return the empty string, plus all of the valid string prefixes. 
-def _all_string_prefixes(version_info): +def _all_string_prefixes(version_info, include_fstring=False): def different_case_versions(prefix): for s in _itertools.product(*[(c, c.upper()) for c in prefix]): yield ''.join(s) @@ -79,7 +73,7 @@ def _all_string_prefixes(version_info): if version_info >= (3, 0): _valid_string_prefixes.append('br') - if version_info >= (3, 6): + if version_info >= (3, 6) and include_fstring: _valid_string_prefixes += ['f', 'fr'] # if we add binary f-strings, add: ['fb', 'fbr'] @@ -109,6 +103,10 @@ def _get_token_collection(version_info): return result +fstring_string_single_line = _compile(r'(?:[^{\r\n]+|\{\{)+') +fstring_string_multi_line = _compile(r'(?:[^{]+|\{\{)+') + + def _create_token_collection(version_info): # Note: we use unicode matching for names ("\w") but ascii matching for # number literals. @@ -148,6 +146,9 @@ def _create_token_collection(version_info): # StringPrefix can be the empty string (making it optional). possible_prefixes = _all_string_prefixes(version_info) StringPrefix = group(*possible_prefixes) + StringPrefixWithF = group(*_all_string_prefixes(version_info, include_fstring=True)) + fstring_prefixes = _all_string_prefixes(version_info, include_fstring=True, only_fstring=True) + FStringStart = group(*fstring_prefixes) # Tail end of ' string. Single = r"[^'\\]*(?:\\.[^'\\]*)*'" @@ -157,7 +158,7 @@ def _create_token_collection(version_info): Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" # Tail end of """ string. Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' - Triple = group(StringPrefix + "'''", StringPrefix + '"""') + Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""') # Because of leftmost-then-longest match semantics, be sure to put the # longest operators first (e.g., if = came before ==, == would get @@ -181,7 +182,11 @@ def _create_token_collection(version_info): group("'", r'\\\r?\n'), StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r'\\\r?\n')) - PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple) + pseudo_extra_pool = [Comment, Triple] + if fstring_prefixes: + pseudo_extra_pool.append(FStringStart) + + PseudoExtras = group(r'\\\r?\n|\Z', pseudo_extra_pool) PseudoToken = group(Whitespace, capture=True) + \ group(PseudoExtras, Number, Funny, ContStr, Name, capture=True) @@ -199,18 +204,24 @@ def _create_token_collection(version_info): # including the opening quotes. 
single_quoted = set() triple_quoted = set() + fstring_endpats = {} for t in possible_prefixes: - for p in (t + '"', t + "'"): - single_quoted.add(p) - for p in (t + '"""', t + "'''"): - triple_quoted.add(p) + for quote in '"', "'": + single_quoted.add(t + quote) + + for quote in '"""', "'''": + triple_quoted.add(t + quote) + + for t in fstring_prefixes: + for quote in '"', "'", '"""', "'''": + fstring_endpats[t + quote] = quote ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except', 'finally', 'while', 'with', 'return') pseudo_token_compiled = _compile(PseudoToken) return TokenCollection( pseudo_token_compiled, single_quoted, triple_quoted, endpats, - ALWAYS_BREAK_TOKENS + fstring_endpats, ALWAYS_BREAK_TOKENS ) @@ -233,6 +244,72 @@ class PythonToken(Token): self._replace(type=self._get_type_name())) +class FStringNode(object): + def __init__(self, quote): + self.quote = quote + self.parentheses_count = 0 + self.previous_lines = '' + self.in_format_spec = False + + def open_parentheses(self, character): + self.parentheses_count += 1 + + def close_parentheses(self, character): + self.parentheses_count -= 1 + return self.parentheses_count == 0 + + def allow_multiline(self): + return len(self.quote == 3) + + def is_in_expr(self): + return self.parentheses_count and not self.in_format_spec + + +def _check_fstring_ending(fstring_stack, token): + fstring_end = float('inf') + fstring_index = None + for i, node in enumerate(fstring_stack): + try: + end = token.index(node.quote) + except ValueError: + pass + else: + if fstring_index is None or end < fstring_end: + fstring_index = i + fstring_end = end + return fstring_index, fstring_end + + +def _find_fstring_string(fstring_stack, line, pos): + tos = fstring_stack[-1] + if tos.is_in_expr(): + return '', pos + else: + new_pos = pos + allow_multiline = tos.allow_multiline() + if allow_multiline: + match = fstring_string_multi_line.match(line, pos) + else: + match = fstring_string_single_line.match(line, pos) + if match is None: + string = fstring_stack.previous_lines + else: + string = match.group(0) + for fstring_stack_node in fstring_stack: + try: + string = string[:string.index(fstring_stack_node.quote)] + except ValueError: + pass # The string was not found. + + new_pos += len(string) + if allow_multiline and string.endswith('\n'): + fstring_stack.previous_lines += string + string = '' + else: + string = fstring_stack_node.previous_lines + string + return string, new_pos + + def tokenize(code, version_info, start_pos=(1, 0)): """Generate tokens from a the source code (string).""" lines = split_lines(code, keepends=True) @@ -247,7 +324,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): token. This idea comes from lib2to3. The prefix contains all information that is irrelevant for the parser like newlines in parentheses or comments. 
""" - pseudo_token, single_quoted, triple_quoted, endpats, always_break_tokens, = \ + pseudo_token, single_quoted, triple_quoted, endpats, fstring_endpats, always_break_tokens, = \ _get_token_collection(version_info) paren_level = 0 # count parentheses indents = [0] @@ -264,6 +341,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): additional_prefix = '' first = True lnum = start_pos[0] - 1 + fstring_stack = [] for line in lines: # loop over lines in stream lnum += 1 pos = 0 @@ -294,6 +372,13 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): continue while pos < max: + if fstring_stack: + string, pos = _find_fstring_string(fstring_stack, line, pos) + if string: + fstring_stack.previous_lines = '' + yield PythonToken(FSTRING_STRING, string, (lnum, pos), '') + continue + pseudomatch = pseudo_token.match(line, pos) if not pseudomatch: # scan for tokens txt = line[pos:] @@ -318,10 +403,11 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): if new_line and initial not in '\r\n#': new_line = False - if paren_level == 0: + if paren_level == 0 and not fstring_stack: i = 0 while line[i] == '\f': i += 1 + # TODO don't we need to change spos as well? start -= 1 if start > indents[-1]: yield PythonToken(INDENT, '', spos, '') @@ -333,11 +419,30 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): yield PythonToken(DEDENT, '', spos, '') indents.pop() + if fstring_stack: + fstring_index, end = _check_fstring_ending(fstring_stack, token) + if fstring_index is not None: + if end != 0: + yield PythonToken(ERRORTOKEN, token[:end], spos, prefix) + + yield PythonToken( + FSTRING_END, + fstring_stack[fstring_index].quote, + (lnum, spos[1] + 1), + prefix='' + ) + del fstring_index[fstring_index:] + pos -= len(token) - end + continue + if (initial in numchars or # ordinary number (initial == '.' and token != '.' and token != '...')): yield PythonToken(NUMBER, token, spos, prefix) elif initial in '\r\n': - if not new_line and paren_level == 0: + if any(not f.allow_multiline() for f in fstring_stack): + fstring_stack.clear() + + if not new_line and paren_level == 0 and not fstring_stack: yield PythonToken(NEWLINE, token, spos, prefix) else: additional_prefix = prefix + token @@ -369,8 +474,12 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): break else: # ordinary string yield PythonToken(STRING, token, spos, prefix) + elif token in fstring_endpats: + fstring_stack.append(FStringNode(fstring_endpats[token])) + yield PythonToken(FSTRING_START, token, spos, prefix) elif is_identifier(initial): # ordinary name if token in always_break_tokens: + fstring_stack.clear() paren_level = 0 while True: indent = indents.pop() @@ -385,9 +494,19 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): break else: if token in '([{': - paren_level += 1 + if fstring_stack: + fstring_stack[-1].open_bracket(token) + else: + paren_level += 1 elif token in ')]}': - paren_level -= 1 + if fstring_stack: + if fstring_stack[-1].close_parentheses(token): + fstring_stack.pop() + else: + paren_level -= 1 + elif token == ':' and fstring_stack \ + and fstring_stack[-1].parentheses_count == 1: + fstring_stack[-1].in_format_spec = True try: # This check is needed in any case to check if it's a valid From 235fda3fbb1b502248f5de7cc8b51ee47c7aed7b Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Fri, 30 Mar 2018 22:13:18 +0200 Subject: [PATCH 05/16] Fix a few things so that the tokenizer can at least parse the grammar. 
--- parso/pgen2/pgen.py | 5 +++-- parso/python/token.py | 3 +++ parso/python/tokenize.py | 45 ++++++++++++++++++++++------------------ 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/parso/pgen2/pgen.py b/parso/pgen2/pgen.py index 10ef6ff..a3e39fa 100644 --- a/parso/pgen2/pgen.py +++ b/parso/pgen2/pgen.py @@ -28,6 +28,7 @@ class ParserGenerator(object): c = grammar.Grammar(self._bnf_text) names = list(self.dfas.keys()) names.sort() + # TODO do we still need this? names.remove(self.startsymbol) names.insert(0, self.startsymbol) for name in names: @@ -316,8 +317,8 @@ class ParserGenerator(object): def _expect(self, type): if self.type != type: - self._raise_error("expected %s, got %s(%s)", - type, self.type, self.value) + self._raise_error("expected %s(%s), got %s(%s)", + type, token.tok_name[type], self.type, self.value) value = self.value self._gettoken() return value diff --git a/parso/python/token.py b/parso/python/token.py index 42f1a89..dd849b0 100644 --- a/parso/python/token.py +++ b/parso/python/token.py @@ -38,6 +38,8 @@ FSTRING_END = next(_counter) tok_name[FSTRING_END] = 'FSTRING_END' FSTRING_STRING = next(_counter) tok_name[FSTRING_STRING] = 'FSTRING_STRING' +EXCLAMATION = next(_counter) +tok_name[EXCLAMATION] = 'EXCLAMATION' # Map from operator to number (since tokenize doesn't do this) @@ -90,6 +92,7 @@ opmap_raw = """\ //= DOUBLESLASHEQUAL -> RARROW ... ELLIPSIS +! EXCLAMATION """ opmap = {} diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py index b13f847..9e1c33f 100644 --- a/parso/python/tokenize.py +++ b/parso/python/tokenize.py @@ -28,7 +28,7 @@ from parso.utils import split_lines TokenCollection = namedtuple( 'TokenCollection', - 'pseudo_token single_quoted triple_quoted endpats fstring_endpats always_break_tokens', + 'pseudo_token single_quoted triple_quoted endpats fstring_pattern_map always_break_tokens', ) BOM_UTF8_STRING = BOM_UTF8.decode('utf-8') @@ -53,32 +53,35 @@ def group(*choices, **kwargs): return start + '|'.join(choices) + ')' -def any(*choices): - return group(*choices) + '*' - - def maybe(*choices): return group(*choices) + '?' # Return the empty string, plus all of the valid string prefixes. -def _all_string_prefixes(version_info, include_fstring=False): +def _all_string_prefixes(version_info, include_fstring=False, only_fstring=False): def different_case_versions(prefix): for s in _itertools.product(*[(c, c.upper()) for c in prefix]): yield ''.join(s) # The valid string prefixes. Only contain the lower case versions, # and don't contain any permuations (include 'fr', but not # 'rf'). The various permutations will be generated. 
- _valid_string_prefixes = ['b', 'r', 'u'] + valid_string_prefixes = ['b', 'r', 'u'] if version_info >= (3, 0): - _valid_string_prefixes.append('br') + valid_string_prefixes.append('br') + result = {''} if version_info >= (3, 6) and include_fstring: - _valid_string_prefixes += ['f', 'fr'] + f = ['f', 'fr'] + if only_fstring: + valid_string_prefixes = f + result = set() + else: + valid_string_prefixes += f + elif only_fstring: + return set() # if we add binary f-strings, add: ['fb', 'fbr'] - result = set(['']) - for prefix in _valid_string_prefixes: + for prefix in valid_string_prefixes: for t in _itertools.permutations(prefix): # create a list with upper and lower versions of each # character @@ -183,10 +186,11 @@ def _create_token_collection(version_info): StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r'\\\r?\n')) pseudo_extra_pool = [Comment, Triple] + all_quotes = '"', "'", '"""', "'''" if fstring_prefixes: - pseudo_extra_pool.append(FStringStart) + pseudo_extra_pool.append(FStringStart + group(*all_quotes)) - PseudoExtras = group(r'\\\r?\n|\Z', pseudo_extra_pool) + PseudoExtras = group(r'\\\r?\n|\Z', *pseudo_extra_pool) PseudoToken = group(Whitespace, capture=True) + \ group(PseudoExtras, Number, Funny, ContStr, Name, capture=True) @@ -204,7 +208,7 @@ def _create_token_collection(version_info): # including the opening quotes. single_quoted = set() triple_quoted = set() - fstring_endpats = {} + fstring_pattern_map = {} for t in possible_prefixes: for quote in '"', "'": single_quoted.add(t + quote) @@ -213,15 +217,15 @@ def _create_token_collection(version_info): triple_quoted.add(t + quote) for t in fstring_prefixes: - for quote in '"', "'", '"""', "'''": - fstring_endpats[t + quote] = quote + for quote in all_quotes: + fstring_pattern_map[t + quote] = quote ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except', 'finally', 'while', 'with', 'return') pseudo_token_compiled = _compile(PseudoToken) return TokenCollection( pseudo_token_compiled, single_quoted, triple_quoted, endpats, - fstring_endpats, ALWAYS_BREAK_TOKENS + fstring_pattern_map, ALWAYS_BREAK_TOKENS ) @@ -324,7 +328,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): token. This idea comes from lib2to3. The prefix contains all information that is irrelevant for the parser like newlines in parentheses or comments. """ - pseudo_token, single_quoted, triple_quoted, endpats, fstring_endpats, always_break_tokens, = \ + pseudo_token, single_quoted, triple_quoted, endpats, fstring_pattern_map, always_break_tokens, = \ _get_token_collection(version_info) paren_level = 0 # count parentheses indents = [0] @@ -372,6 +376,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): continue while pos < max: + assert not fstring_stack if fstring_stack: string, pos = _find_fstring_string(fstring_stack, line, pos) if string: @@ -474,8 +479,8 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): break else: # ordinary string yield PythonToken(STRING, token, spos, prefix) - elif token in fstring_endpats: - fstring_stack.append(FStringNode(fstring_endpats[token])) + elif token in fstring_pattern_map: # The start of an fstring. 
+ fstring_stack.append(FStringNode(fstring_pattern_map[token])) yield PythonToken(FSTRING_START, token, spos, prefix) elif is_identifier(initial): # ordinary name if token in always_break_tokens: From b1aa7c6a7995955f957e1f70380ff947034a1d8c Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Sat, 31 Mar 2018 14:25:29 +0200 Subject: [PATCH 06/16] Cleanup a lot of details in the tokenizer for fstrings --- parso/python/grammar36.txt | 8 ++--- parso/python/tokenize.py | 60 +++++++++++++++++++++++++------------- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/parso/python/grammar36.txt b/parso/python/grammar36.txt index 60076eb..7984649 100644 --- a/parso/python/grammar36.txt +++ b/parso/python/grammar36.txt @@ -150,8 +150,8 @@ yield_expr: 'yield' [yield_arg] yield_arg: 'from' test | testlist strings: (STRING | fstring)+ -fstring: FSTRING_START fstring_content FSTRING_END -fstring_content: (FSTRING_STRING | fstring_expr)* +fstring: FSTRING_START fstring_content* FSTRING_END +fstring_content: (FSTRING_STRING | fstring_expr) fstring_conversion: '!' NAME -fstring_expr: '{' testlist [ fstring_conversion ] [ fstring_format_spec ] '}' -fstring_format_spec: ':' fstring_content +fstring_expr: '{' testlist [ fstring_conversion ] [ fstring_format_spec ] '}' +fstring_format_spec: ':' fstring_content* diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py index 9e1c33f..150f72a 100644 --- a/parso/python/tokenize.py +++ b/parso/python/tokenize.py @@ -69,7 +69,7 @@ def _all_string_prefixes(version_info, include_fstring=False, only_fstring=False if version_info >= (3, 0): valid_string_prefixes.append('br') - result = {''} + result = set(['']) if version_info >= (3, 6) and include_fstring: f = ['f', 'fr'] if only_fstring: @@ -260,27 +260,33 @@ class FStringNode(object): def close_parentheses(self, character): self.parentheses_count -= 1 - return self.parentheses_count == 0 def allow_multiline(self): - return len(self.quote == 3) + return len(self.quote) == 3 def is_in_expr(self): return self.parentheses_count and not self.in_format_spec -def _check_fstring_ending(fstring_stack, token): +def _check_fstring_ending(fstring_stack, token, from_start=False): fstring_end = float('inf') fstring_index = None for i, node in enumerate(fstring_stack): - try: - end = token.index(node.quote) - except ValueError: - pass - else: - if fstring_index is None or end < fstring_end: + if from_start: + if token.startswith(node.quote): fstring_index = i - fstring_end = end + fstring_end = len(node.quote) + else: + continue + else: + try: + end = token.index(node.quote) + except ValueError: + pass + else: + if fstring_index is None or end < fstring_end: + fstring_index = i + fstring_end = end return fstring_index, fstring_end @@ -296,7 +302,7 @@ def _find_fstring_string(fstring_stack, line, pos): else: match = fstring_string_single_line.match(line, pos) if match is None: - string = fstring_stack.previous_lines + string = fstring_stack[-1].previous_lines else: string = match.group(0) for fstring_stack_node in fstring_stack: @@ -307,10 +313,12 @@ def _find_fstring_string(fstring_stack, line, pos): new_pos += len(string) if allow_multiline and string.endswith('\n'): - fstring_stack.previous_lines += string + fstring_stack[-1].previous_lines += string string = '' else: - string = fstring_stack_node.previous_lines + string + string = fstring_stack[-1].previous_lines + string + + fstring_stack[-1].previous_lines = '' return string, new_pos @@ -376,14 +384,27 @@ def tokenize_lines(lines, version_info, start_pos=(1, 
0)): continue while pos < max: - assert not fstring_stack if fstring_stack: string, pos = _find_fstring_string(fstring_stack, line, pos) if string: - fstring_stack.previous_lines = '' yield PythonToken(FSTRING_STRING, string, (lnum, pos), '') continue + if pos < max: + rest = line[pos:] + fstring_index, end = _check_fstring_ending(fstring_stack, rest, from_start=True) + + if fstring_index is not None: + yield PythonToken( + FSTRING_END, + fstring_stack[fstring_index].quote, + (lnum, pos), + prefix='' + ) + del fstring_stack[fstring_index:] + pos += end + continue + pseudomatch = pseudo_token.match(line, pos) if not pseudomatch: # scan for tokens txt = line[pos:] @@ -436,7 +457,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): (lnum, spos[1] + 1), prefix='' ) - del fstring_index[fstring_index:] + del fstring_stack[fstring_index:] pos -= len(token) - end continue @@ -500,13 +521,12 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): else: if token in '([{': if fstring_stack: - fstring_stack[-1].open_bracket(token) + fstring_stack[-1].open_parentheses(token) else: paren_level += 1 elif token in ')]}': if fstring_stack: - if fstring_stack[-1].close_parentheses(token): - fstring_stack.pop() + fstring_stack[-1].close_parentheses(token) else: paren_level -= 1 elif token == ':' and fstring_stack \ From 97f042c6badd6599f06bdf0c6427e3417349a2d7 Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Sat, 31 Mar 2018 14:26:12 +0200 Subject: [PATCH 07/16] Remove clutter from the grammar --- parso/python/grammar36.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parso/python/grammar36.txt b/parso/python/grammar36.txt index 7984649..7d112f7 100644 --- a/parso/python/grammar36.txt +++ b/parso/python/grammar36.txt @@ -151,7 +151,7 @@ yield_arg: 'from' test | testlist strings: (STRING | fstring)+ fstring: FSTRING_START fstring_content* FSTRING_END -fstring_content: (FSTRING_STRING | fstring_expr) +fstring_content: FSTRING_STRING | fstring_expr fstring_conversion: '!' NAME fstring_expr: '{' testlist [ fstring_conversion ] [ fstring_format_spec ] '}' fstring_format_spec: ':' fstring_content* From 56b3e2cdc835476b60c0b80e7585e3a7c159c2e3 Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Thu, 5 Apr 2018 00:45:03 +0200 Subject: [PATCH 08/16] Also use the fstring modfications for the 3.7 grammar --- parso/python/grammar37.txt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/parso/python/grammar37.txt b/parso/python/grammar37.txt index e76147e..7d112f7 100644 --- a/parso/python/grammar37.txt +++ b/parso/python/grammar37.txt @@ -108,7 +108,7 @@ atom_expr: ['await'] atom trailer* atom: ('(' [yield_expr|testlist_comp] ')' | '[' [testlist_comp] ']' | '{' [dictorsetmaker] '}' | - NAME | NUMBER | STRING+ | '...' | 'None' | 'True' | 'False') + NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False') testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] ) trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME subscriptlist: subscript (',' subscript)* [','] @@ -148,3 +148,10 @@ encoding_decl: NAME yield_expr: 'yield' [yield_arg] yield_arg: 'from' test | testlist + +strings: (STRING | fstring)+ +fstring: FSTRING_START fstring_content* FSTRING_END +fstring_content: FSTRING_STRING | fstring_expr +fstring_conversion: '!' 
NAME +fstring_expr: '{' testlist [ fstring_conversion ] [ fstring_format_spec ] '}' +fstring_format_spec: ':' fstring_content* From 7c7f4f4e545e0070b9eb1e71cb106e48a9aa2421 Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Thu, 5 Apr 2018 00:45:23 +0200 Subject: [PATCH 09/16] Fix a test --- test/test_tokenize.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/test_tokenize.py b/test/test_tokenize.py index bd62d95..2951380 100644 --- a/test/test_tokenize.py +++ b/test/test_tokenize.py @@ -7,7 +7,8 @@ import pytest from parso._compatibility import py_version from parso.utils import split_lines, parse_version_string from parso.python.token import ( - NAME, NEWLINE, STRING, INDENT, DEDENT, ERRORTOKEN, ENDMARKER, ERROR_DEDENT) + NAME, NEWLINE, STRING, INDENT, DEDENT, ERRORTOKEN, ENDMARKER, ERROR_DEDENT, + FSTRING_START) from parso.python import tokenize from parso import parse from parso.python.tokenize import PythonToken @@ -162,8 +163,9 @@ def test_ur_literals(): token_list = _get_token_list(literal) typ, result_literal, _, _ = token_list[0] if is_literal: - assert typ == STRING - assert result_literal == literal + if typ != FSTRING_START: + assert typ == STRING + assert result_literal == literal else: assert typ == NAME @@ -175,6 +177,7 @@ def test_ur_literals(): # Starting with Python 3.3 this ordering is also possible. if py_version >= 33: check('Rb""') + # Starting with Python 3.6 format strings where introduced. check('fr""', is_literal=py_version >= 36) check('rF""', is_literal=py_version >= 36) From a3e280c2b907be417d64c84e6c4d0c83d789be2b Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Thu, 5 Apr 2018 09:55:19 +0200 Subject: [PATCH 10/16] Use strings as a non-terminal symbol in all grammars This makes it easier to write the same logic for all Python versions --- parso/python/grammar26.txt | 3 ++- parso/python/grammar27.txt | 3 ++- parso/python/grammar33.txt | 3 ++- parso/python/grammar34.txt | 3 ++- parso/python/grammar35.txt | 3 ++- parso/python/parser.py | 4 +++- parso/python/tree.py | 9 +++++++++ 7 files changed, 22 insertions(+), 6 deletions(-) diff --git a/parso/python/grammar26.txt b/parso/python/grammar26.txt index b972a41..d9cede2 100644 --- a/parso/python/grammar26.txt +++ b/parso/python/grammar26.txt @@ -119,7 +119,8 @@ atom: ('(' [yield_expr|testlist_comp] ')' | '[' [listmaker] ']' | '{' [dictorsetmaker] '}' | '`' testlist1 '`' | - NAME | NUMBER | STRING+) + NAME | NUMBER | strings) +strings: STRING+ listmaker: test ( list_for | (',' test)* [','] ) # Dave: Renamed testlist_gexpr to testlist_comp, because in 2.7+ this is the # default. It's more consistent like this. diff --git a/parso/python/grammar27.txt b/parso/python/grammar27.txt index 4c3f33d..359f12b 100644 --- a/parso/python/grammar27.txt +++ b/parso/python/grammar27.txt @@ -104,7 +104,8 @@ atom: ('(' [yield_expr|testlist_comp] ')' | '[' [listmaker] ']' | '{' [dictorsetmaker] '}' | '`' testlist1 '`' | - NAME | NUMBER | STRING+) + NAME | NUMBER | strings) +strings: STRING+ listmaker: test ( list_for | (',' test)* [','] ) testlist_comp: test ( comp_for | (',' test)* [','] ) lambdef: 'lambda' [varargslist] ':' test diff --git a/parso/python/grammar33.txt b/parso/python/grammar33.txt index d7aaffd..3a55809 100644 --- a/parso/python/grammar33.txt +++ b/parso/python/grammar33.txt @@ -103,7 +103,8 @@ power: atom trailer* ['**' factor] atom: ('(' [yield_expr|testlist_comp] ')' | '[' [testlist_comp] ']' | '{' [dictorsetmaker] '}' | - NAME | NUMBER | STRING+ | '...' 
| 'None' | 'True' | 'False') + NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False') +strings: STRING+ testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] ) trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME subscriptlist: subscript (',' subscript)* [','] diff --git a/parso/python/grammar34.txt b/parso/python/grammar34.txt index 05c3181..324bba1 100644 --- a/parso/python/grammar34.txt +++ b/parso/python/grammar34.txt @@ -103,7 +103,8 @@ power: atom trailer* ['**' factor] atom: ('(' [yield_expr|testlist_comp] ')' | '[' [testlist_comp] ']' | '{' [dictorsetmaker] '}' | - NAME | NUMBER | STRING+ | '...' | 'None' | 'True' | 'False') + NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False') +strings: STRING+ testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] ) trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME subscriptlist: subscript (',' subscript)* [','] diff --git a/parso/python/grammar35.txt b/parso/python/grammar35.txt index c38217f..5868b8f 100644 --- a/parso/python/grammar35.txt +++ b/parso/python/grammar35.txt @@ -110,7 +110,8 @@ atom_expr: ['await'] atom trailer* atom: ('(' [yield_expr|testlist_comp] ')' | '[' [testlist_comp] ']' | '{' [dictorsetmaker] '}' | - NAME | NUMBER | STRING+ | '...' | 'None' | 'True' | 'False') + NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False') +strings: STRING+ testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] ) trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME subscriptlist: subscript (',' subscript)* [','] diff --git a/parso/python/parser.py b/parso/python/parser.py index 1897f53..4eb9241 100644 --- a/parso/python/parser.py +++ b/parso/python/parser.py @@ -1,6 +1,6 @@ from parso.python import tree from parso.python.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER, - STRING, tok_name, NAME) + STRING, tok_name, NAME, FSTRING_STRING) from parso.parser import BaseParser from parso.pgen2.parse import token_to_ilabel @@ -129,6 +129,8 @@ class Parser(BaseParser): return tree.Newline(value, start_pos, prefix) elif type == ENDMARKER: return tree.EndMarker(value, start_pos, prefix) + elif type == FSTRING_STRING: + return tree.FStringString(value, start_pos, prefix) else: return tree.Operator(value, start_pos, prefix) diff --git a/parso/python/tree.py b/parso/python/tree.py index eb97780..075505f 100644 --- a/parso/python/tree.py +++ b/parso/python/tree.py @@ -262,6 +262,15 @@ class String(Literal): return match.group(2)[:-len(match.group(1))] +class FStringString(Leaf): + """ + f-strings contain f-string expressions and normal python strings. These are + the string parts of f-strings. 
+ """ + type = 'fstring_string' + __slots__ = () + + class _StringComparisonMixin(object): def __eq__(self, other): """ From 9d2ce4bcd4c1763ee0ce7451bda123864d4976b6 Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Fri, 6 Apr 2018 09:50:07 +0200 Subject: [PATCH 11/16] Fix a few fstring error gatherings --- parso/python/errors.py | 20 +++++++++++++++++++- parso/python/grammar36.txt | 2 +- parso/python/tokenize.py | 2 +- test/failing_examples.py | 2 +- test/test_python_errors.py | 14 ++++++++++++++ 5 files changed, 36 insertions(+), 4 deletions(-) diff --git a/parso/python/errors.py b/parso/python/errors.py index fe1e273..8dc99b7 100644 --- a/parso/python/errors.py +++ b/parso/python/errors.py @@ -843,7 +843,7 @@ class _TryStmtRule(SyntaxRule): self.add_issue(default_except, message=self.message) -@ErrorFinder.register_rule(type='string') +@ErrorFinder.register_rule(type='fstring') class _FStringRule(SyntaxRule): _fstring_grammar = None message_empty = "f-string: empty expression not allowed" # f'{}' @@ -864,7 +864,25 @@ class _FStringRule(SyntaxRule): cls._fstring_grammar = parso.load_grammar(language='python-f-string') return cls._fstring_grammar + def _check_type(self, fstring_string): + index = -1 + value = fstring_string.value + while True: + index = value.find('}', index + 1) + if index == -1: + break # No further } found, we're finished. + elif index + 1 != len(value) and value[index + 1]: + # It's }}, which is totally ok. + index += 1 + else: + self.add_issue(fstring_string, message=self.message_single_closing) + def is_issue(self, fstring): + for fstring_content in fstring.children[1:-1]: + if fstring_content.type == 'fstring_string': + self._check_type(fstring_content) + return + print(fstring) if 'f' not in fstring.string_prefix.lower(): return diff --git a/parso/python/grammar36.txt b/parso/python/grammar36.txt index 7d112f7..b82c1fe 100644 --- a/parso/python/grammar36.txt +++ b/parso/python/grammar36.txt @@ -153,5 +153,5 @@ strings: (STRING | fstring)+ fstring: FSTRING_START fstring_content* FSTRING_END fstring_content: FSTRING_STRING | fstring_expr fstring_conversion: '!' NAME -fstring_expr: '{' testlist [ fstring_conversion ] [ fstring_format_spec ] '}' +fstring_expr: '{' testlist_comp [ fstring_conversion ] [ fstring_format_spec ] '}' fstring_format_spec: ':' fstring_content* diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py index 150f72a..a410287 100644 --- a/parso/python/tokenize.py +++ b/parso/python/tokenize.py @@ -166,7 +166,7 @@ def _create_token_collection(version_info): # Because of leftmost-then-longest match semantics, be sure to put the # longest operators first (e.g., if = came before ==, == would get # recognized as two instances of =). - Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=", + Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"//=?", r"->", r"[+\-*/%&@`|^=<>]=?", r"~") diff --git a/test/failing_examples.py b/test/failing_examples.py index 4227679..25e93ca 100644 --- a/test/failing_examples.py +++ b/test/failing_examples.py @@ -141,7 +141,7 @@ FAILING_EXAMPLES = [ # f-strings 'f"{}"', - 'f"{\\}"', + r'f"{\}"', 'f"{\'\\\'}"', 'f"{#}"', "f'{1!b}'", diff --git a/test/test_python_errors.py b/test/test_python_errors.py index b724a41..ca7be7f 100644 --- a/test/test_python_errors.py +++ b/test/test_python_errors.py @@ -114,6 +114,20 @@ def _get_actual_exception(code): # Python 3.4/3.4 have a bit of a different warning than 3.5/3.6 in # certain places. But in others this error makes sense. 
return [wanted, "SyntaxError: can't use starred expression here"], line_nr + elif wanted == 'SyntaxError: f-string: unterminated string': + wanted = 'SyntaxError: EOL while scanning string literal' + elif wanted == 'SyntaxError: f-string expression part cannot include a backslash': + return [ + wanted, + "SyntaxError: EOL while scanning string literal", + "SyntaxError: unexpected character after line continuation character", + ], line_nr + elif wanted == 'SyntaxError: f-string: empty expression not allowed': + wanted = 'SyntaxError: invalid syntax' + elif wanted == "SyntaxError: f-string expression part cannot include '#'": + wanted = 'SyntaxError: invalid syntax' + elif wanted == "SyntaxError: f-string: expecting '}'": + wanted = 'SyntaxError: EOL while scanning string literal' return [wanted], line_nr From 0d96b125669635dce2d16b383d9cde509853ef0f Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Fri, 6 Apr 2018 09:59:15 +0200 Subject: [PATCH 12/16] Fix the fstring syntax if there's a conversion with exclamation mark --- parso/python/errors.py | 14 ++++++++++++-- parso/python/tokenize.py | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/parso/python/errors.py b/parso/python/errors.py index 8dc99b7..48d4f4e 100644 --- a/parso/python/errors.py +++ b/parso/python/errors.py @@ -864,7 +864,7 @@ class _FStringRule(SyntaxRule): cls._fstring_grammar = parso.load_grammar(language='python-f-string') return cls._fstring_grammar - def _check_type(self, fstring_string): + def _check_string_part(self, fstring_string): index = -1 value = fstring_string.value while True: @@ -877,10 +877,20 @@ class _FStringRule(SyntaxRule): else: self.add_issue(fstring_string, message=self.message_single_closing) + def _check_fstring_expr(self, fstring_expr): + conversion = fstring_expr.children[2] + if conversion.type == 'fstring_conversion': + name = conversion.children[1] + if name.value not in ('s', 'r', 'a'): + self.add_issue(name, message=self.message_conversion) + def is_issue(self, fstring): for fstring_content in fstring.children[1:-1]: if fstring_content.type == 'fstring_string': - self._check_type(fstring_content) + self._check_string_part(fstring_content) + else: + assert fstring_content.type == 'fstring_expr' + self._check_fstring_expr(fstring_content) return print(fstring) if 'f' not in fstring.string_prefix.lower(): diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py index a410287..10612ce 100644 --- a/parso/python/tokenize.py +++ b/parso/python/tokenize.py @@ -168,7 +168,7 @@ def _create_token_collection(version_info): # recognized as two instances of =). 
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"//=?", r"->", - r"[+\-*/%&@`|^=<>]=?", + r"[+\-*/%&@`|^!=<>]=?", r"~") Bracket = '[][(){}]' From afb71dc762875def5646d91f29ae3c1c1a1572fe Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Fri, 6 Apr 2018 20:29:22 +0200 Subject: [PATCH 13/16] Remove the f-string rules and replace them with new ones --- parso/python/errors.py | 106 ++++++--------------------------------- parso/python/tokenize.py | 2 +- 2 files changed, 16 insertions(+), 92 deletions(-) diff --git a/parso/python/errors.py b/parso/python/errors.py index 48d4f4e..c50fa4f 100644 --- a/parso/python/errors.py +++ b/parso/python/errors.py @@ -846,23 +846,12 @@ class _TryStmtRule(SyntaxRule): @ErrorFinder.register_rule(type='fstring') class _FStringRule(SyntaxRule): _fstring_grammar = None - message_empty = "f-string: empty expression not allowed" # f'{}' message_single_closing = "f-string: single '}' is not allowed" # f'}' message_nested = "f-string: expressions nested too deeply" - message_backslash = "f-string expression part cannot include a backslash" # f'{"\"}' or f'{"\\"}' - message_comment = "f-string expression part cannot include '#'" # f'{#}' - message_unterminated_string = "f-string: unterminated string" # f'{"}' message_conversion = "f-string: invalid conversion character: expected 's', 'r', or 'a'" - message_incomplete = "f-string: expecting '}'" # f'{' - message_syntax = "invalid syntax" - @classmethod - def _load_grammar(cls): - import parso - - if cls._fstring_grammar is None: - cls._fstring_grammar = parso.load_grammar(language='python-f-string') - return cls._fstring_grammar + def _check_format_spec(self, format_spec, depth): + self._check_fstring_contents(format_spec.children[1:], depth) def _check_string_part(self, fstring_string): index = -1 @@ -877,95 +866,30 @@ class _FStringRule(SyntaxRule): else: self.add_issue(fstring_string, message=self.message_single_closing) - def _check_fstring_expr(self, fstring_expr): + def _check_fstring_expr(self, fstring_expr, depth): + if depth >= 2: + self.add_issue(fstring_expr, message=self.message_nested) + conversion = fstring_expr.children[2] if conversion.type == 'fstring_conversion': name = conversion.children[1] if name.value not in ('s', 'r', 'a'): self.add_issue(name, message=self.message_conversion) + format_spec = fstring_expr.children[-2] + if format_spec.type == 'fstring_format_spec': + self._check_format_spec(format_spec, depth + 1) + def is_issue(self, fstring): - for fstring_content in fstring.children[1:-1]: + self._check_fstring_contents(fstring.children[1:-1]) + + def _check_fstring_contents(self, children, depth=0): + for fstring_content in children: if fstring_content.type == 'fstring_string': self._check_string_part(fstring_content) else: assert fstring_content.type == 'fstring_expr' - self._check_fstring_expr(fstring_content) - return - print(fstring) - if 'f' not in fstring.string_prefix.lower(): - return - - parsed = self._load_grammar().parse_leaf(fstring) - for child in parsed.children: - if child.type == 'expression': - self._check_expression(child) - elif child.type == 'error_node': - next_ = child.get_next_leaf() - if next_.type == 'error_leaf' and next_.original_type == 'unterminated_string': - self.add_issue(next_, message=self.message_unterminated_string) - # At this point nothing more is comming except the error - # leaf that we've already checked here. 
- break - self.add_issue(child, message=self.message_incomplete) - elif child.type == 'error_leaf': - self.add_issue(child, message=self.message_single_closing) - - def _check_python_expr(self, python_expr): - value = python_expr.value - if '\\' in value: - self.add_issue(python_expr, message=self.message_backslash) - return - if '#' in value: - self.add_issue(python_expr, message=self.message_comment) - return - if re.match('\s*$', value) is not None: - self.add_issue(python_expr, message=self.message_empty) - return - - # This is now nested parsing. We parsed the fstring and now - # we're parsing Python again. - try: - # CPython has a bit of a special ways to parse Python code within - # f-strings. It wraps the code in brackets to make sure that - # whitespace doesn't make problems (indentation/newlines). - # Just use that algorithm as well here and adapt start positions. - start_pos = python_expr.start_pos - start_pos = start_pos[0], start_pos[1] - 1 - eval_input = self._normalizer.grammar._parse( - '(%s)' % value, - start_symbol='eval_input', - start_pos=start_pos, - error_recovery=False - ) - except ParserSyntaxError as e: - self.add_issue(e.error_leaf, message=self.message_syntax) - return - - issues = self._normalizer.grammar.iter_errors(eval_input) - self._normalizer.issues += issues - - def _check_format_spec(self, format_spec): - for expression in format_spec.children[1:]: - nested_format_spec = expression.children[-2] - if nested_format_spec.type == 'format_spec': - if len(nested_format_spec.children) > 1: - self.add_issue( - nested_format_spec.children[1], - message=self.message_nested - ) - - self._check_expression(expression) - - def _check_expression(self, expression): - for c in expression.children: - if c.type == 'python_expr': - self._check_python_expr(c) - elif c.type == 'conversion': - if c.value not in ('s', 'r', 'a'): - self.add_issue(c, message=self.message_conversion) - elif c.type == 'format_spec': - self._check_format_spec(c) + self._check_fstring_expr(fstring_content, depth) class _CheckAssignmentRule(SyntaxRule): diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py index 10612ce..6c55c9a 100644 --- a/parso/python/tokenize.py +++ b/parso/python/tokenize.py @@ -265,7 +265,7 @@ class FStringNode(object): return len(self.quote) == 3 def is_in_expr(self): - return self.parentheses_count and not self.in_format_spec + return self.parentheses_count def _check_fstring_ending(fstring_stack, token, from_start=False): From 9941348ec6a1a2da6869abc07a49b7e84247b107 Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Fri, 6 Apr 2018 20:30:07 +0200 Subject: [PATCH 14/16] Add python 3.7 to tox --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 4b6d3b2..344aacc 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py26, py27, py33, py34, py35, py36 +envlist = py26, py27, py33, py34, py35, py36, py37 [testenv] deps = {env:_PARSO_TEST_PYTEST_DEP:pytest>=3.0.7} From 8f1a436ba1b4e081a8d427dcc9439ab9c7a9de10 Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Sat, 7 Apr 2018 02:11:26 +0200 Subject: [PATCH 15/16] Remove the old f-string grammar and fix the tests with the new syntax --- parso/grammar.py | 34 ------- parso/pgen2/parse.py | 4 +- parso/python/fstring.py | 211 --------------------------------------- parso/python/tokenize.py | 62 +++++++----- test/test_fstring.py | 48 +++++---- 5 files changed, 66 insertions(+), 293 deletions(-) diff --git a/parso/grammar.py b/parso/grammar.py index e5abf81..c825b55 
100644 --- a/parso/grammar.py +++ b/parso/grammar.py @@ -12,7 +12,6 @@ from parso.parser import BaseParser from parso.python.parser import Parser as PythonParser from parso.python.errors import ErrorFinderConfig from parso.python import pep8 -from parso.python import fstring _loaded_grammars = {} @@ -186,7 +185,6 @@ class Grammar(object): normalizer.walk(node) return normalizer.issues - def __repr__(self): labels = self._pgen_grammar.number2symbol.values() txt = ' '.join(list(labels)[:3]) + ' ...' @@ -215,34 +213,6 @@ class PythonGrammar(Grammar): return tokenize(code, self.version_info) -class PythonFStringGrammar(Grammar): - _token_namespace = fstring.TokenNamespace - _start_symbol = 'fstring' - - def __init__(self): - super(PythonFStringGrammar, self).__init__( - text=fstring.GRAMMAR, - tokenizer=fstring.tokenize, - parser=fstring.Parser - ) - - def parse(self, code, **kwargs): - return self._parse(code, **kwargs) - - def _parse(self, code, error_recovery=True, start_pos=(1, 0)): - tokens = self._tokenizer(code, start_pos=start_pos) - p = self._parser( - self._pgen_grammar, - error_recovery=error_recovery, - start_symbol=self._start_symbol, - ) - return p.parse(tokens=tokens) - - def parse_leaf(self, leaf, error_recovery=True): - code = leaf._get_payload() - return self.parse(code, error_recovery=True, start_pos=leaf.start_pos) - - def load_grammar(**kwargs): """ Loads a :py:class:`parso.Grammar`. The default version is the current Python @@ -273,10 +243,6 @@ def load_grammar(**kwargs): except FileNotFoundError: message = "Python version %s is currently not supported." % version raise NotImplementedError(message) - elif language == 'python-f-string': - if version is not None: - raise NotImplementedError("Currently different versions are not supported.") - return PythonFStringGrammar() else: raise NotImplementedError("No support for language %s." % language) diff --git a/parso/pgen2/parse.py b/parso/pgen2/parse.py index aaacfce..e2369d1 100644 --- a/parso/pgen2/parse.py +++ b/parso/pgen2/parse.py @@ -124,7 +124,9 @@ class PgenParser(object): self.error_recovery = error_recovery def parse(self, tokens): - for type_, value, start_pos, prefix in tokens: + for tok in tokens: + print(tok) + type_, value, start_pos, prefix = tok if self.add_token(type_, value, start_pos, prefix): break else: diff --git a/parso/python/fstring.py b/parso/python/fstring.py index a8fe7b4..e69de29 100644 --- a/parso/python/fstring.py +++ b/parso/python/fstring.py @@ -1,211 +0,0 @@ -import re - -from itertools import count -from parso.utils import PythonVersionInfo -from parso.utils import split_lines -from parso.python.tokenize import Token -from parso import parser -from parso.tree import TypedLeaf, ErrorNode, ErrorLeaf - -version36 = PythonVersionInfo(3, 6) - - -class TokenNamespace: - _c = count() - LBRACE = next(_c) - RBRACE = next(_c) - ENDMARKER = next(_c) - COLON = next(_c) - CONVERSION = next(_c) - PYTHON_EXPR = next(_c) - EXCLAMATION_MARK = next(_c) - UNTERMINATED_STRING = next(_c) - - token_map = dict((v, k) for k, v in locals().items() if not k.startswith('_')) - - @classmethod - def generate_token_id(cls, string): - if string == '{': - return cls.LBRACE - elif string == '}': - return cls.RBRACE - elif string == '!': - return cls.EXCLAMATION_MARK - elif string == ':': - return cls.COLON - return getattr(cls, string) - - -GRAMMAR = """ -fstring: expression* ENDMARKER -format_spec: ':' expression* -expression: '{' PYTHON_EXPR [ '!' 
CONVERSION ] [ format_spec ] '}' -""" - -_prefix = r'((?:[^{}]+)*)' -_expr = _prefix + r'(\{|\}|$)' -_in_expr = r'([^{}\[\]:"\'!]*)(.?)' -# There's only one conversion character allowed. But the rules have to be -# checked later anyway, so allow more here. This makes error recovery nicer. -_conversion = r'([^={}:]*)(.?)' - -_compiled_expr = re.compile(_expr) -_compiled_in_expr = re.compile(_in_expr) -_compiled_conversion = re.compile(_conversion) - - -def tokenize(code, start_pos=(1, 0)): - def add_to_pos(string): - lines = split_lines(string) - l = len(lines[-1]) - if len(lines) > 1: - start_pos[0] += len(lines) - 1 - start_pos[1] = l - else: - start_pos[1] += l - - def tok(value, type=None, prefix=''): - if type is None: - type = TokenNamespace.generate_token_id(value) - - add_to_pos(prefix) - token = Token(type, value, tuple(start_pos), prefix) - add_to_pos(value) - return token - - start = 0 - recursion_level = 0 - added_prefix = '' - start_pos = list(start_pos) - while True: - match = _compiled_expr.match(code, start) - prefix = added_prefix + match.group(1) - found = match.group(2) - start = match.end() - if not found: - # We're at the end. - break - - if found == '}': - if recursion_level == 0 and len(code) > start and code[start] == '}': - # This is a }} escape. - added_prefix = prefix + '}}' - start += 1 - continue - - recursion_level = max(0, recursion_level - 1) - yield tok(found, prefix=prefix) - added_prefix = '' - else: - assert found == '{' - if recursion_level == 0 and len(code) > start and code[start] == '{': - # This is a {{ escape. - added_prefix = prefix + '{{' - start += 1 - continue - - recursion_level += 1 - yield tok(found, prefix=prefix) - added_prefix = '' - - expression = '' - squared_count = 0 - curly_count = 0 - while True: - expr_match = _compiled_in_expr.match(code, start) - expression += expr_match.group(1) - found = expr_match.group(2) - start = expr_match.end() - - if found == '{': - curly_count += 1 - expression += found - elif found == '}' and curly_count > 0: - curly_count -= 1 - expression += found - elif found == '[': - squared_count += 1 - expression += found - elif found == ']': - # Use a max function here, because the Python code might - # just have syntax errors. - squared_count = max(0, squared_count - 1) - expression += found - elif found == ':' and (squared_count or curly_count): - expression += found - elif found in ('"', "'"): - search = found - if len(code) > start + 1 and \ - code[start] == found == code[start+1]: - search *= 3 - start += 2 - - index = code.find(search, start) - if index == -1: - yield tok(expression, type=TokenNamespace.PYTHON_EXPR) - yield tok( - found + code[start:], - type=TokenNamespace.UNTERMINATED_STRING, - ) - start = len(code) - break - expression += found + code[start:index+1] - start = index + 1 - elif found == '!' and len(code) > start and code[start] == '=': - # This is a python `!=` and not a conversion. - expression += found - else: - yield tok(expression, type=TokenNamespace.PYTHON_EXPR) - if found: - yield tok(found) - break - - if found == '!': - conversion_match = _compiled_conversion.match(code, start) - found = conversion_match.group(2) - start = conversion_match.end() - yield tok(conversion_match.group(1), type=TokenNamespace.CONVERSION) - if found: - yield tok(found) - if found == '}': - recursion_level -= 1 - - # We don't need to handle everything after ':', because that is - # basically new tokens. 
- - yield tok('', type=TokenNamespace.ENDMARKER, prefix=prefix) - - -class Parser(parser.BaseParser): - def parse(self, tokens): - node = super(Parser, self).parse(tokens) - if isinstance(node, self.default_leaf): # Is an endmarker. - # If there's no curly braces we get back a non-module. We always - # want an fstring. - node = self.default_node('fstring', [node]) - - return node - - def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos): - # TODO this is so ugly. - leaf_type = TokenNamespace.token_map[type].lower() - return TypedLeaf(leaf_type, value, start_pos, prefix) - - def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix, - add_token_callback): - if not self._error_recovery: - return super(Parser, self).error_recovery( - pgen_grammar, stack, arcs, typ, value, start_pos, prefix, - add_token_callback - ) - - token_type = TokenNamespace.token_map[typ].lower() - if len(stack) == 1: - error_leaf = ErrorLeaf(token_type, value, start_pos, prefix) - stack[0][2][1].append(error_leaf) - else: - dfa, state, (type_, nodes) = stack[1] - stack[0][2][1].append(ErrorNode(nodes)) - stack[1:] = [] - - add_token_callback(typ, value, start_pos, prefix) diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py index 6c55c9a..7d72fc6 100644 --- a/parso/python/tokenize.py +++ b/parso/python/tokenize.py @@ -106,8 +106,8 @@ def _get_token_collection(version_info): return result -fstring_string_single_line = _compile(r'(?:[^{\r\n]+|\{\{)+') -fstring_string_multi_line = _compile(r'(?:[^{]+|\{\{)+') +fstring_string_single_line = _compile(r'(?:[^{}\r\n]+|\{\{|\}\})+') +fstring_string_multi_line = _compile(r'(?:[^{}]+|\{\{|\}\})+') def _create_token_collection(version_info): @@ -253,7 +253,10 @@ class FStringNode(object): self.quote = quote self.parentheses_count = 0 self.previous_lines = '' - self.in_format_spec = False + self.last_string_start_pos = None + # In the syntax there can be multiple format_spec's nested: + # {x:{y:3}} + self.format_spec_count = 0 def open_parentheses(self, character): self.parentheses_count += 1 @@ -265,7 +268,7 @@ class FStringNode(object): return len(self.quote) == 3 def is_in_expr(self): - return self.parentheses_count + return (self.parentheses_count - self.format_spec_count) > 0 def _check_fstring_ending(fstring_stack, token, from_start=False): @@ -290,7 +293,7 @@ def _check_fstring_ending(fstring_stack, token, from_start=False): return fstring_index, fstring_end -def _find_fstring_string(fstring_stack, line, pos): +def _find_fstring_string(fstring_stack, line, lnum, pos): tos = fstring_stack[-1] if tos.is_in_expr(): return '', pos @@ -302,8 +305,12 @@ def _find_fstring_string(fstring_stack, line, pos): else: match = fstring_string_single_line.match(line, pos) if match is None: - string = fstring_stack[-1].previous_lines + string = tos.previous_lines else: + print(match, lnum, pos, repr(tos.previous_lines)) + if not tos.previous_lines: + tos.last_string_start_pos = (lnum, pos) + string = match.group(0) for fstring_stack_node in fstring_stack: try: @@ -313,12 +320,11 @@ def _find_fstring_string(fstring_stack, line, pos): new_pos += len(string) if allow_multiline and string.endswith('\n'): - fstring_stack[-1].previous_lines += string + tos.previous_lines += string string = '' else: - string = fstring_stack[-1].previous_lines + string + string = tos.previous_lines + string - fstring_stack[-1].previous_lines = '' return string, new_pos @@ -385,25 +391,31 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): while pos < max: 
if fstring_stack: - string, pos = _find_fstring_string(fstring_stack, line, pos) + string, pos = _find_fstring_string(fstring_stack, line, lnum, pos) if string: - yield PythonToken(FSTRING_STRING, string, (lnum, pos), '') + yield PythonToken( + FSTRING_STRING, string, + fstring_stack[-1].last_string_start_pos, '' + ) + fstring_stack[-1].previous_lines = '' continue - if pos < max: - rest = line[pos:] - fstring_index, end = _check_fstring_ending(fstring_stack, rest, from_start=True) + if pos == max: + break - if fstring_index is not None: - yield PythonToken( - FSTRING_END, - fstring_stack[fstring_index].quote, - (lnum, pos), - prefix='' - ) - del fstring_stack[fstring_index:] - pos += end - continue + rest = line[pos:] + fstring_index, end = _check_fstring_ending(fstring_stack, rest, from_start=True) + + if fstring_index is not None: + yield PythonToken( + FSTRING_END, + fstring_stack[fstring_index].quote, + (lnum, pos), + prefix='' + ) + del fstring_stack[fstring_index:] + pos += end + continue pseudomatch = pseudo_token.match(line, pos) if not pseudomatch: # scan for tokens @@ -531,7 +543,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)): paren_level -= 1 elif token == ':' and fstring_stack \ and fstring_stack[-1].parentheses_count == 1: - fstring_stack[-1].in_format_spec = True + fstring_stack[-1].format_spec_count += 1 try: # This check is needed in any case to check if it's a valid diff --git a/test/test_fstring.py b/test/test_fstring.py index 936d7fb..59debd9 100644 --- a/test/test_fstring.py +++ b/test/test_fstring.py @@ -1,17 +1,18 @@ import pytest from parso import load_grammar, ParserSyntaxError -from parso.python.fstring import tokenize +from parso.python.tokenize import tokenize @pytest.fixture def grammar(): - return load_grammar(language="python-f-string") + return load_grammar(version='3.6') @pytest.mark.parametrize( 'code', [ '{1}', + '{1:}', '', '{1!a}', '{1!a:1}', @@ -26,22 +27,12 @@ def grammar(): '{{{1}', '1{{2{{3', '}}', - '{:}}}', - - # Invalid, but will be checked, later. - '{}', - '{1:}', - '{:}', - '{:1}', - '{!:}', - '{!}', - '{!a}', - '{1:{}}', - '{1:{:}}', ] ) def test_valid(code, grammar): - fstring = grammar.parse(code, error_recovery=False) + code = 'f"""%s"""' % code + module = grammar.parse(code, error_recovery=False) + fstring = module.children[0] assert fstring.type == 'fstring' assert fstring.get_code() == code @@ -52,24 +43,37 @@ def test_valid(code, grammar): '{', '{1!{a}}', '{!{a}}', + '{}', + '{:}', + '{:}}}', + '{:1}', + '{!:}', + '{!}', + '{!a}', + '{1:{}}', + '{1:{:}}', ] ) def test_invalid(code, grammar): + code = 'f"""%s"""' % code with pytest.raises(ParserSyntaxError): grammar.parse(code, error_recovery=False) # It should work with error recovery. - #grammar.parse(code, error_recovery=True) + grammar.parse(code, error_recovery=True) @pytest.mark.parametrize( - ('code', 'start_pos', 'positions'), [ + ('code', 'positions'), [ # 2 times 2, 5 because python expr and endmarker. 
- ('}{', (2, 3), [(2, 3), (2, 4), (2, 5), (2, 5)]), - (' :{ 1 : } ', (1, 0), [(1, 2), (1, 3), (1, 6), (1, 8), (1, 10)]), - ('\n{\nfoo\n }', (2, 1), [(3, 0), (3, 1), (5, 1), (5, 2)]), + ('f"}{"', [(1, 0), (1, 2), (1, 3), (1, 4), (1, 5)]), + ('f" :{ 1 : } "', [(1, 0), (1, 2), (1, 4), (1, 6), (1, 8), (1, 9), + (1, 10), (1, 11), (1, 12), (1, 13)]), + ('f"""\n {\nfoo\n }"""', [(1, 0), (1, 4), (2, 1), (3, 0), (4, 1), + (4, 2), (4, 5)]), ] ) -def test_tokenize_start_pos(code, start_pos, positions): - tokens = tokenize(code, start_pos) +def test_tokenize_start_pos(code, positions): + tokens = list(tokenize(code, version_info=(3, 6))) + print(tokens) assert positions == [p.start_pos for p in tokens] From cba4f2ccc1999a2ef415448d9fd97921e4f72717 Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Sat, 7 Apr 2018 02:14:35 +0200 Subject: [PATCH 16/16] Fix the syntax errors from f-strings --- parso/python/errors.py | 19 +------------------ parso/python/fstring.py | 0 test/test_python_errors.py | 6 ++++-- 3 files changed, 5 insertions(+), 20 deletions(-) delete mode 100644 parso/python/fstring.py diff --git a/parso/python/errors.py b/parso/python/errors.py index c50fa4f..cfb8380 100644 --- a/parso/python/errors.py +++ b/parso/python/errors.py @@ -846,26 +846,12 @@ class _TryStmtRule(SyntaxRule): @ErrorFinder.register_rule(type='fstring') class _FStringRule(SyntaxRule): _fstring_grammar = None - message_single_closing = "f-string: single '}' is not allowed" # f'}' message_nested = "f-string: expressions nested too deeply" message_conversion = "f-string: invalid conversion character: expected 's', 'r', or 'a'" def _check_format_spec(self, format_spec, depth): self._check_fstring_contents(format_spec.children[1:], depth) - def _check_string_part(self, fstring_string): - index = -1 - value = fstring_string.value - while True: - index = value.find('}', index + 1) - if index == -1: - break # No further } found, we're finished. - elif index + 1 != len(value) and value[index + 1]: - # It's }}, which is totally ok. 
- index += 1 - else: - self.add_issue(fstring_string, message=self.message_single_closing) - def _check_fstring_expr(self, fstring_expr, depth): if depth >= 2: self.add_issue(fstring_expr, message=self.message_nested) @@ -885,10 +871,7 @@ class _FStringRule(SyntaxRule): def _check_fstring_contents(self, children, depth=0): for fstring_content in children: - if fstring_content.type == 'fstring_string': - self._check_string_part(fstring_content) - else: - assert fstring_content.type == 'fstring_expr' + if fstring_content.type == 'fstring_expr': self._check_fstring_expr(fstring_content, depth) diff --git a/parso/python/fstring.py b/parso/python/fstring.py deleted file mode 100644 index e69de29..0000000 diff --git a/test/test_python_errors.py b/test/test_python_errors.py index ca7be7f..67f3e1d 100644 --- a/test/test_python_errors.py +++ b/test/test_python_errors.py @@ -122,12 +122,14 @@ def _get_actual_exception(code): "SyntaxError: EOL while scanning string literal", "SyntaxError: unexpected character after line continuation character", ], line_nr + elif wanted == "SyntaxError: f-string: expecting '}'": + wanted = 'SyntaxError: EOL while scanning string literal' elif wanted == 'SyntaxError: f-string: empty expression not allowed': wanted = 'SyntaxError: invalid syntax' elif wanted == "SyntaxError: f-string expression part cannot include '#'": wanted = 'SyntaxError: invalid syntax' - elif wanted == "SyntaxError: f-string: expecting '}'": - wanted = 'SyntaxError: EOL while scanning string literal' + elif wanted == "SyntaxError: f-string: single '}' is not allowed": + wanted = 'SyntaxError: invalid syntax' return [wanted], line_nr
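
To exercise the new f-string support end to end, the following sketch mirrors the updated tests in test_fstring.py; it assumes parso with this series applied and the Python 3.6 grammar, and the printed token stream is illustrative rather than authoritative:

    from parso import load_grammar
    from parso.python.tokenize import tokenize

    grammar = load_grammar(version='3.6')

    # f-strings are now parsed into a dedicated 'fstring' node by the main
    # grammar instead of being handed off to a separate f-string grammar.
    module = grammar.parse('f"""{1!a:1}"""', error_recovery=False)
    fstring = module.children[0]
    assert fstring.type == 'fstring'
    assert fstring.get_code() == 'f"""{1!a:1}"""'

    # The main tokenizer emits FSTRING_START/FSTRING_STRING/FSTRING_END
    # itself, so start positions refer to the original source text.
    for token in tokenize('f"""\n {\nfoo\n }"""', version_info=(3, 6)):
        print(token.start_pos, repr(token.string))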
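
The last patch moves the remaining f-string checks into _FStringRule, so they surface through parso's normal error API rather than through a separate f-string parser. A minimal sketch of that path follows; the message wording comes from the rule above, but the triggering snippet and the use of iter_errors here are assumptions, not part of this series:

    from parso import load_grammar

    grammar = load_grammar(version='3.6')

    # Format specs nested more than two levels deep should be flagged by
    # _FStringRule with "f-string: expressions nested too deeply".
    module = grammar.parse('f"{1:{2:{3}}}"')
    for issue in grammar.iter_errors(module):
        print(issue.start_pos, issue.message)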