10 Commits

Author SHA1 Message Date
Dave Halter
c0ace63a69 For Python 2.7 and 3.4, pytest 5 doesn't work anymore 2019-07-13 15:46:58 +02:00
Dave Halter
399e8e5043 Prepare the 0.5.1 release 2019-07-13 15:39:44 +02:00
Dave Halter
0a5b5f3346 Fix name tokenizing for Python 2 2019-07-13 15:34:23 +02:00
Dave Halter
2b8544021f Fix positioning for names that are interleaved with error tokens 2019-07-13 12:34:49 +02:00
Dave Halter
99dd4a84d4 Merge branch 'master' of github.com:davidhalter/parso 2019-07-12 21:35:06 +02:00
Dave Halter
9501b0bde0 Fixed name tokenizing issues for Tamil characters, fixes davidhalter/jedi#1368 2019-07-12 21:31:49 +02:00
Benjamin Woodruff
ad57a51800 Fix line continuation characters inside f-strings
Line continuation characters are valid inside of strings, but weren't
handled correctly in certain cases with f-strings, due to some small
tokenizer bugs.

This pull request addresses those issues and adds tests to validate
the new logic.
2019-07-12 21:20:00 +02:00
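A sketch of the now-expected tokenization for such a case, using parso's public tokenizer (assuming parso 0.5.1; the stream also ends with an ENDMARKER):

    from parso.python.tokenize import tokenize

    # Before this fix, the backslash-newline pair terminated the
    # FSTRING_STRING match; now the whole literal tokenizes as
    # FSTRING_START, FSTRING_STRING, FSTRING_END.
    for token in tokenize('f"abc\\\ndef"', version_info=(3, 6)):
        print(token.type, repr(token.string))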
Dave Halter
19de3eb5ca Document that the cache uses pickle files 2019-07-10 00:17:28 -07:00
Dave Halter
7441e6b1d2 Fix changelog dates, fixes #77 2019-06-28 02:00:35 -07:00
Dave Halter
df3c494e02 Try to use collections.abc.Mapping instead of collections.Mapping
The latter is deprecated and will be removed in Python 3.9, fixes #76
2019-06-21 10:17:18 +02:00
9 changed files with 218 additions and 65 deletions

View File: CHANGELOG.rst

@@ -3,6 +3,12 @@
Changelog
---------
+0.5.1 (2019-07-13)
+++++++++++++++++++
+- Fix: Some unicode identifiers were not correctly tokenized
+- Fix: Line continuations in f-strings are now working
0.5.0 (2019-06-20)
++++++++++++++++++
@@ -17,19 +23,19 @@ Changelog
- Python 3.8 support
- FileIO support, it's now possible to use abstract file IO, support is alpha
-0.3.4 (2018-02-13)
+0.3.4 (2019-02-13)
+++++++++++++++++++
- Fix an f-string tokenizer error
-0.3.3 (2018-02-06)
+0.3.3 (2019-02-06)
+++++++++++++++++++
- Fix async errors in the diff parser
- A fix in iter_errors
- This is a very small bugfix release
-0.3.2 (2018-01-24)
+0.3.2 (2019-01-24)
+++++++++++++++++++
- 20+ bugfixes in the diff parser and 3 in the tokenizer
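
Both 0.5.1 fixes, sketched end to end (assuming parso 0.5.1 is installed; the node types in the comments are the expected ones):

    import parso

    # Unicode identifiers such as Tamil names now tokenize as NAME tokens
    # (davidhalter/jedi#1368).
    tree = parso.parse(u'மெல்லினம் = 1')
    print(tree.children[0].type)  # 'simple_stmt', not an error node

    # A line continuation inside a single-line f-string now parses.
    tree = parso.parse('f"abc\\\ndef"', version='3.6')
    print(tree.children[0].type)  # 'simple_stmt'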

View File: parso/__init__.py

@@ -43,7 +43,7 @@ from parso.grammar import Grammar, load_grammar
from parso.utils import split_lines, python_bytes_to_unicode
-__version__ = '0.5.0'
+__version__ = '0.5.1'
def parse(code=None, **kwargs):

View File: parso/grammar.py

@@ -57,7 +57,8 @@ class Grammar(object):
:param str path: The path to the file you want to open. Only needed for caching.
:param bool cache: Keeps a copy of the parser tree in RAM and on disk
if a path is given. Returns the cached trees if the corresponding
-files on disk have not changed.
+files on disk have not changed. Note that this stores pickle files
+on your file system (e.g. for Linux in ``~/.cache/parso/``).
:param bool diff_cache: Diffs the cached python module against the new
code and tries to parse only the parts that have changed. Returns
the same (changed) module that is found in cache. Using this option
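
A minimal usage sketch of the cache behavior described above (the file path is hypothetical):

    import parso

    grammar = parso.load_grammar()
    # The first call parses example.py and pickles the tree (e.g. to
    # ~/.cache/parso/ on Linux); subsequent calls return the cached tree
    # as long as the file on disk is unchanged.
    module = grammar.parse(path='example.py', cache=True)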

View File: parso/python/tokenize.py

@@ -23,6 +23,9 @@ from parso._compatibility import py_version
from parso.utils import split_lines
+# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
+MAX_UNICODE = '\U0010ffff'
STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
@@ -51,8 +54,13 @@ if py_version >= 30:
# Python 3 has str.isidentifier() to check if a char is a valid identifier
is_identifier = str.isidentifier
else:
-namechars = string.ascii_letters + '_'
-is_identifier = lambda s: s in namechars
+# Python 2 doesn't, but it's not that important anymore and if you tokenize
+# Python 2 code with this, it's still ok. It's just that parsing Python 3
+# code with this function is not 100% correct.
+# This just means that Python 2 code matches a few identifiers too much,
+# but that doesn't really matter.
+def is_identifier(s):
+return True
def group(*choices, **kwargs):
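
A rough illustration of the asymmetry between the two branches (Python 3 shown; the Python 2 fallback accepts any string, which only over-matches, as the comment above says):

    print('foo'.isidentifier())   # True
    print('ä'.isidentifier())     # True, unicode identifiers are valid names
    print('²'.isidentifier())     # False, not a valid identifier character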
@@ -118,9 +126,9 @@ def _get_token_collection(version_info):
return result
-fstring_string_single_line = _compile(r'(?:[^{}\r\n]+|\{\{|\}\})+')
+fstring_string_single_line = _compile(r'(?:\{\{|\}\}|\\(?:\r\n?|\n)|[^{}\r\n])+')
fstring_string_multi_line = _compile(r'(?:[^{}]+|\{\{|\}\})+')
-fstring_format_spec_single_line = _compile(r'[^{}\r\n]+')
+fstring_format_spec_single_line = _compile(r'(?:\\(?:\r\n?|\n)|[^{}\r\n])+')
fstring_format_spec_multi_line = _compile(r'[^{}]+')
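
What the new single-line pattern buys, sketched with the standard re module (the \\(?:\r\n?|\n) alternative is the addition):

    import re

    pat = re.compile(r'(?:\{\{|\}\}|\\(?:\r\n?|\n)|[^{}\r\n])+')
    # The old pattern stopped at the backslash; the new one consumes the
    # backslash-newline as part of the f-string's string portion.
    assert pat.match('abc\\\ndef').group(0) == 'abc\\\ndef'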
@@ -130,7 +138,16 @@ def _create_token_collection(version_info):
Whitespace = r'[ \f\t]*'
whitespace = _compile(Whitespace)
Comment = r'#[^\r\n]*'
-Name = r'\w+'
+# Python 2 is pretty much not working properly anymore, we just ignore
+# parsing unicode properly, which is fine, I guess.
+if version_info[0] == 2:
+Name = r'([A-Za-z_0-9]+)'
+elif sys.version_info[0] == 2:
+# Unfortunately the regex engine cannot deal with the regex below, so
+# just use this one.
+Name = r'(\w+)'
+else:
+Name = u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)'
if version_info >= (3, 6):
Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
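
The Python 3 pattern deliberately over-matches: it accepts every code point from U+0080 up to MAX_UNICODE, and the tokenizer afterwards uses is_identifier to split out anything illegal (see _split_illegal_unicode_name below). A small sketch:

    import re

    MAX_UNICODE = '\U0010ffff'
    name_re = re.compile(u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)')

    print(name_re.match(u'மெல்லினம்').group(0))  # the whole Tamil name matches
    print(name_re.match(u'ä²ö').group(0))  # matches 'ä²ö', although '²' is
                                           # not a valid identifier character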
@@ -340,7 +357,9 @@ def _find_fstring_string(endpats, fstring_stack, line, lnum, pos):
new_pos = pos
new_pos += len(string)
-if allow_multiline and (string.endswith('\n') or string.endswith('\r')):
+# even if allow_multiline is False, we still need to check for trailing
+# newlines, because a single-line f-string can contain line continuations
+if string.endswith('\n') or string.endswith('\r'):
tos.previous_lines += string
string = ''
else:
@@ -510,6 +529,24 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
if (initial in numchars or # ordinary number
(initial == '.' and token != '.' and token != '...')):
yield PythonToken(NUMBER, token, spos, prefix)
+elif pseudomatch.group(3) is not None: # ordinary name
+if token in always_break_tokens:
+fstring_stack[:] = []
+paren_level = 0
+# We only want to dedent if the token is on a new line.
+if re.match(r'[ \f\t]*$', line[:start]):
+while True:
+indent = indents.pop()
+if indent > start:
+yield PythonToken(DEDENT, '', spos, '')
+else:
+indents.append(indent)
+break
+if is_identifier(token):
+yield PythonToken(NAME, token, spos, prefix)
+else:
+for t in _split_illegal_unicode_name(token, spos, prefix):
+yield t # yield from Python 2
elif initial in '\r\n':
if any(not f.allow_multiline() for f in fstring_stack):
# Would use fstring_stack.clear, but that's not available
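
The observable effect of the new branch, sketched through the public tokenizer (assuming parso 0.5.1; an ENDMARKER follows the shown tokens):

    from parso.python.tokenize import tokenize

    # One regex match is split into NAME 'ä', ERRORTOKEN '²', NAME 'ö'
    # instead of yielding a single bogus NAME.
    for token in tokenize(u'ä²ö', version_info=(3, 6)):
        print(token.type, repr(token.string))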
@@ -564,20 +601,6 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
elif token in fstring_pattern_map: # The start of an fstring.
fstring_stack.append(FStringNode(fstring_pattern_map[token]))
yield PythonToken(FSTRING_START, token, spos, prefix)
-elif is_identifier(initial): # ordinary name
-if token in always_break_tokens:
-fstring_stack[:] = []
-paren_level = 0
-# We only want to dedent if the token is on a new line.
-if re.match(r'[ \f\t]*$', line[:start]):
-while True:
-indent = indents.pop()
-if indent > start:
-yield PythonToken(DEDENT, '', spos, '')
-else:
-indents.append(indent)
-break
-yield PythonToken(NAME, token, spos, prefix)
elif initial == '\\' and line[start:] in ('\\\n', '\\\r\n', '\\\r'): # continued stmt
additional_prefix += prefix + line[start:]
break
@@ -613,6 +636,39 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)
+def _split_illegal_unicode_name(token, start_pos, prefix):
+def create_token():
+return PythonToken(ERRORTOKEN if is_illegal else NAME, found, pos, prefix)
+found = ''
+is_illegal = False
+pos = start_pos
+for i, char in enumerate(token):
+if is_illegal:
+if is_identifier(char):
+yield create_token()
+found = char
+is_illegal = False
+prefix = ''
+pos = start_pos[0], start_pos[1] + i
+else:
+found += char
+else:
+new_found = found + char
+if is_identifier(new_found):
+found = new_found
+else:
+if found:
+yield create_token()
+prefix = ''
+pos = start_pos[0], start_pos[1] + i
+found = char
+is_illegal = True
+if found:
+yield create_token()
if __name__ == "__main__":
if len(sys.argv) >= 2:
path = sys.argv[1]
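
To make the generator above concrete, a hand trace of _split_illegal_unicode_name(u'ä²ö', (1, 0), '') under the logic shown (token boundaries are the expected ones):

    # i=0, 'ä': 'ä'.isidentifier() is True, accumulate found='ä'
    # i=1, '²': 'ä²'.isidentifier() is False, so yield NAME 'ä' at (1, 0)
    #           and start an illegal run with found='²'
    # i=2, 'ö': is_identifier('ö') is True, so yield ERRORTOKEN '²' at (1, 1)
    #           and start a legal run with found='ö'
    # end:      yield the pending NAME 'ö' at (1, 2)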

View File: parso/python/tree.py

@@ -43,7 +43,10 @@ Parser Tree Classes
"""
import re
-from collections import Mapping
+try:
+from collections.abc import Mapping
+except ImportError:
+from collections import Mapping
from parso._compatibility import utf8_repr, unicode
from parso.tree import Node, BaseNode, Leaf, ErrorNode, ErrorLeaf, \

View File: test/test_diff_parser.py

@@ -974,10 +974,12 @@ def test_random_unicode_characters(differ):
Those issues were all found with the fuzzer.
"""
differ.initialize('')
-differ.parse(u'\x1dĔBϞɛˁşʑ˳˻ȣſéÎ\x90̕ȟòwʘ\x1dĔBϞɛˁşʑ˳˻ȣſéÎ', parsers=1, expect_error_leaves=True)
+differ.parse(u'\x1dĔBϞɛˁşʑ˳˻ȣſéÎ\x90̕ȟòwʘ\x1dĔBϞɛˁşʑ˳˻ȣſéÎ', parsers=1,
+expect_error_leaves=True)
differ.parse(u'\r\r', parsers=1)
differ.parse(u"˟Ę\x05À\r rúƣ@\x8a\x15r()\n", parsers=1, expect_error_leaves=True)
-differ.parse(u'a\ntaǁ\rGĒōns__\n\nb', parsers=1)
+differ.parse(u'a\ntaǁ\rGĒōns__\n\nb', parsers=1,
+expect_error_leaves=sys.version_info[0] == 2)
s = ' if not (self, "_fi\x02\x0e\x08\n\nle"):'
differ.parse(s, parsers=1, expect_error_leaves=True)
differ.parse('')

View File: test/test_fstring.py

@@ -12,33 +12,57 @@ def grammar():
@pytest.mark.parametrize(
'code', [
-'{1}',
-'{1:}',
-'',
-'{1!a}',
-'{1!a:1}',
-'{1:1}',
-'{1:1.{32}}',
-'{1::>4}',
-'{foo} {bar}',
-'{x:{y}}',
-'{x:{y:}}',
-'{x:{y:1}}',
+# simple cases
+'f"{1}"',
+'f"""{1}"""',
+'f"{foo} {bar}"',
+# empty string
+'f""',
+'f""""""',
+# empty format specifier is okay
+'f"{1:}"',
+# use of conversion options
+'f"{1!a}"',
+'f"{1!a:1}"',
+# format specifiers
+'f"{1:1}"',
+'f"{1:1.{32}}"',
+'f"{1::>4}"',
+'f"{x:{y}}"',
+'f"{x:{y:}}"',
+'f"{x:{y:1}}"',
# Escapes
-'{{}}',
-'{{{1}}}',
-'{{{1}',
-'1{{2{{3',
-'}}',
+'f"{{}}"',
+'f"{{{1}}}"',
+'f"{{{1}"',
+'f"1{{2{{3"',
+'f"}}"',
# New Python 3.8 syntax f'{a=}'
-'{a=}',
-'{a()=}',
+'f"{a=}"',
+'f"{a()=}"',
+# multiline f-string
+'f"""abc\ndef"""',
+'f"""abc{\n123}def"""',
+# a line continuation inside of an fstring_string
+'f"abc\\\ndef"',
+'f"\\\n{123}\\\n"',
+# a line continuation inside of an fstring_expr
+'f"{\\\n123}"',
+# a line continuation inside of a format spec
+'f"{123:.2\\\nf}"',
]
)
def test_valid(code, grammar):
-code = 'f"""%s"""' % code
module = grammar.parse(code, error_recovery=False)
fstring = module.children[0]
assert fstring.type == 'fstring'
@@ -47,23 +71,34 @@ def test_valid(code, grammar):
@pytest.mark.parametrize(
'code', [
-'}',
-'{',
-'{1!{a}}',
-'{!{a}}',
-'{}',
-'{:}',
-'{:}}}',
-'{:1}',
-'{!:}',
-'{!}',
-'{!a}',
-'{1:{}}',
-'{1:{:}}',
+# an f-string can't contain unmatched curly braces
+'f"}"',
+'f"{"',
+'f"""}"""',
+'f"""{"""',
+# invalid conversion characters
+'f"{1!{a}}"',
+'f"{!{a}}"',
+# The curly braces must contain an expression
+'f"{}"',
+'f"{:}"',
+'f"{:}}}"',
+'f"{:1}"',
+'f"{!:}"',
+'f"{!}"',
+'f"{!a}"',
+# invalid (empty) format specifiers
+'f"{1:{}}"',
+'f"{1:{:}}"',
+# a newline without a line continuation inside a single-line string
+'f"abc\ndef"',
]
)
def test_invalid(code, grammar):
-code = 'f"""%s"""' % code
with pytest.raises(ParserSyntaxError):
grammar.parse(code, error_recovery=False)
@@ -95,6 +130,7 @@ def test_tokenize_start_pos(code, positions):
"""),
'f"foo',
'f"""foo',
+'f"abc\ndef"',
]
)
def test_roundtrip(grammar, code):

View File: test/test_tokenize.py

@@ -1,5 +1,6 @@
# -*- coding: utf-8 # This file contains Unicode characters.
+import sys
from textwrap import dedent
import pytest
@@ -16,6 +17,7 @@ from parso.python.tokenize import PythonToken
NAME = PythonTokenTypes.NAME
NEWLINE = PythonTokenTypes.NEWLINE
STRING = PythonTokenTypes.STRING
+NUMBER = PythonTokenTypes.NUMBER
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
@@ -140,7 +142,7 @@ def test_identifier_contains_unicode():
else:
# Unicode tokens in Python 2 seem to be identified as operators.
# They will be ignored in the parser, that's ok.
-assert unicode_token[0] == OP
+assert unicode_token[0] == ERRORTOKEN
def test_quoted_strings():
@@ -228,16 +230,29 @@ def test_endmarker_end_pos():
check('a\\')
+xfail_py2 = dict(marks=[pytest.mark.xfail(sys.version_info[0] == 2, reason='Python 2')])
@pytest.mark.parametrize(
('code', 'types'), [
# Indentation
(' foo', [INDENT, NAME, DEDENT]),
(' foo\n bar', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
(' foo\n bar \n baz', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME,
NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
(' foo\nbar', [INDENT, NAME, NEWLINE, DEDENT, NAME]),
+# Name stuff
+('1foo1', [NUMBER, NAME]),
+pytest.param(
+u'மெல்லினம்', [NAME],
+**xfail_py2),
+pytest.param(u'²', [ERRORTOKEN], **xfail_py2),
+pytest.param(u'ä²ö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
+pytest.param(u'ää²¹öö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
]
)
-def test_indentation(code, types):
+def test_token_types(code, types):
actual_types = [t.type for t in _get_token_list(code)]
assert actual_types == types + [ENDMARKER]
@@ -330,13 +345,46 @@ def test_backslash():
('f" "{}', [FSTRING_START, FSTRING_STRING, FSTRING_END, OP, OP]),
(r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
(r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
# format spec
(r'f"Some {x:.2f}{y}"', [FSTRING_START, FSTRING_STRING, OP, NAME, OP,
FSTRING_STRING, OP, OP, NAME, OP, FSTRING_END]),
+# multiline f-string
+('f"""abc\ndef"""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
+('f"""abc{\n123}def"""', [
+FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
+FSTRING_END
+]),
+# a line continuation inside of an fstring_string
+('f"abc\\\ndef"', [
+FSTRING_START, FSTRING_STRING, FSTRING_END
+]),
+('f"\\\n{123}\\\n"', [
+FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
+FSTRING_END
+]),
+# a line continuation inside of an fstring_expr
+('f"{\\\n123}"', [FSTRING_START, OP, NUMBER, OP, FSTRING_END]),
+# a line continuation inside of a format spec
+('f"{123:.2\\\nf}"', [
+FSTRING_START, OP, NUMBER, OP, FSTRING_STRING, OP, FSTRING_END
+]),
+# a newline without a line continuation inside a single-line string is
+# wrong, and will generate an ERRORTOKEN
+('f"abc\ndef"', [
+FSTRING_START, FSTRING_STRING, NEWLINE, NAME, ERRORTOKEN
+]),
# a more complex example
(r'print(f"Some {x:.2f}a{y}")', [
NAME, OP, FSTRING_START, FSTRING_STRING, OP, NAME, OP,
FSTRING_STRING, OP, FSTRING_STRING, OP, NAME, OP, FSTRING_END, OP
]),
]
)
def test_fstring(code, types, version_ge_py36):

View File: tox.ini

@@ -4,6 +4,7 @@ envlist = {py26,py27,py33,py34,py35,py36,py37}
extras = testing
deps =
py26,py33: pytest>=3.0.7,<3.3
+py27,py34: pytest<5
py26,py33: setuptools<37
coverage: coverage
setenv =