From d39aadc4ccaedbba040a32df11df99c439328003 Mon Sep 17 00:00:00 2001 From: Tim Hatch Date: Sun, 22 Nov 2020 04:37:04 -0800 Subject: [PATCH] Support named unicode characters in f-strings (#160) * Support named unicode characters in f-strings Fixes #154 The previous behavior misinterpreted the curly braces as enclosing an expression. This change does some cursory validation so we can still get parse errors in the most egregious cases, but does not validate that the names are actually valid, only that they are name-shaped and have a chance of being valid. The character names appear to obey a few rules: * Case insensitive * Name characters are `[A-Z0-9 \-]` * Whitespace before or after is not allowed * Whitespace in the middle may only be a single space between words * Dashes may occur at the start or middle of a word ```py f"\N{A B}" # might be legal f"\N{a b}" # equivalent to above f"\N{A  B}" # no way f"\N{ A B }" # no way f"""\N{A
B}""" # no way ``` For confirming this regex matches all (current) unicode character names: ```py import re import sys import unicodedata R = re.compile(r"[A-Za-z0-9\-]+(?: [A-Za-z0-9\-]+)*") for i in range(sys.maxunicode): try: name = unicodedata.name(chr(i)) except ValueError: # Some small values like 0 and 1 have no name, /shrug continue m = R.fullmatch(name) if m is None: print("FAIL", repr(name)) ``` * Improve tests for named unicode escapes --- parso/python/tokenize.py | 11 ++++++++--- test/test_fstring.py | 18 ++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py index 6282275..0eff15d 100644 --- a/parso/python/tokenize.py +++ b/parso/python/tokenize.py @@ -110,9 +110,14 @@ def _get_token_collection(version_info): _create_token_collection(version_info) return result - -fstring_string_single_line = _compile(r'(?:\{\{|\}\}|\\(?:\r\n?|\n)|[^{}\r\n])+') -fstring_string_multi_line = _compile(r'(?:[^{}]+|\{\{|\}\})+') +unicode_character_name = r'[A-Za-z0-9\-]+(?: [A-Za-z0-9\-]+)*' +fstring_string_single_line = _compile( + r'(?:\{\{|\}\}|\\N\{' + unicode_character_name + + r'\}|\\(?:\r\n?|\n)|\\[^\r\nN]|[^{}\r\n\\])+' +) +fstring_string_multi_line = _compile( + r'(?:\{\{|\}\}|\\N\{' + unicode_character_name + r'\}|\\[^N]|[^{}\\])+' +) fstring_format_spec_single_line = _compile(r'(?:\\(?:\r\n?|\n)|[^{}\r\n])+') fstring_format_spec_multi_line = _compile(r'[^{}]+') diff --git a/test/test_fstring.py b/test/test_fstring.py index a17599d..c81d027 100644 --- a/test/test_fstring.py +++ b/test/test_fstring.py @@ -67,6 +67,17 @@ def grammar(): 'f"{x, *y}"', 'f"{*x, y}"', 'f"{x for x in [1]}"', + + # named unicode characters + 'f"\\N{BULLET}"', + 'f"\\N{FLEUR-DE-LIS}"', + 'f"\\N{NO ENTRY}"', + 'f"Combo {expr} and \\N{NO ENTRY}"', + 'f"\\N{NO ENTRY} and {expr}"', + 'f"\\N{no entry}"', + 'f"\\N{SOYOMBO LETTER -A}"', + 'f"\\N{DOMINO TILE HORIZONTAL-00-00}"', + 'f"""\\N{NO ENTRY}"""', ] ) def test_valid(code, grammar): @@ -104,6 +115,11 @@ def test_valid(code, grammar): # a newline without a line continuation inside a single-line string 'f"abc\ndef"', + + # various named unicode escapes that aren't name-shaped + 'f"\\N{ BULLET }"', + 'f"\\N{NO  ENTRY}"', + 'f"""\\N{NO\nENTRY}"""', ] ) def test_invalid(code, grammar): @@ -122,6 +138,8 @@ def test_invalid(code, grammar): (1, 10), (1, 11), (1, 12), (1, 13)]), ('f"""\n {\nfoo\n }"""', [(1, 0), (1, 4), (2, 1), (3, 0), (4, 1), (4, 2), (4, 5)]), + ('f"\\N{NO ENTRY} and {expr}"', [(1, 0), (1, 2), (1, 19), (1, 20), + (1, 24), (1, 25), (1, 26)]), ] ) def test_tokenize_start_pos(code, positions):