10 Commits

Author SHA1 Message Date
Dave Halter
c0ace63a69 For Python 2.7 and 3.4, pytest 5 doesn't work anymore 2019-07-13 15:46:58 +02:00
Dave Halter
399e8e5043 Prepare the 0.5.1 release 2019-07-13 15:39:44 +02:00
Dave Halter
0a5b5f3346 Fix name tokenizing for Python 2 2019-07-13 15:34:23 +02:00
Dave Halter
2b8544021f Fix positioning for names that are interleaved with error tokens 2019-07-13 12:34:49 +02:00
Dave Halter
99dd4a84d4 Merge branch 'master' of github.com:davidhalter/parso 2019-07-12 21:35:06 +02:00
Dave Halter
9501b0bde0 Fixed name tokenizing issues for Tamil characters, fixes davidhalter/jedi#1368 2019-07-12 21:31:49 +02:00
Benjamin Woodruff
ad57a51800 Fix line continuation characters inside f-strings
Line continuation characters are valid inside of strings, but weren't
handled correctly in certain cases with f-strings, due to some small
tokenizer bugs.

This pull request addresses those issues and adds tests to validate
the new logic.
2019-07-12 21:20:00 +02:00
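A sketch of the now-expected tokenization for such a case, using parso's public tokenizer (assuming parso 0.5.1; the stream also ends with an ENDMARKER):

    from parso.python.tokenize import tokenize

    # Before this fix, the backslash-newline pair terminated the
    # FSTRING_STRING match; now the whole literal tokenizes as
    # FSTRING_START, FSTRING_STRING, FSTRING_END.
    for token in tokenize('f"abc\\\ndef"', version_info=(3, 6)):
        print(token.type, repr(token.string))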
Dave Halter
19de3eb5ca Document that the cache uses pickle files 2019-07-10 00:17:28 -07:00
Dave Halter
7441e6b1d2 Fix changelog dates, fixes #77 2019-06-28 02:00:35 -07:00
Dave Halter
df3c494e02 Try to use collections.abc.Mapping instead of collections.Mapping
The latter is deprecated and will be removed in Python 3.9, fixes #76
2019-06-21 10:17:18 +02:00
9 changed files with 218 additions and 65 deletions

View File: CHANGELOG.rst

@@ -3,6 +3,12 @@
Changelog
---------
+0.5.1 (2019-07-13)
+++++++++++++++++++
+- Fix: Some unicode identifiers were not correctly tokenized
+- Fix: Line continuations in f-strings are now working
0.5.0 (2019-06-20)
++++++++++++++++++
@@ -17,19 +23,19 @@ Changelog
- Python 3.8 support
- FileIO support, it's now possible to use abstract file IO, support is alpha
-0.3.4 (2018-02-13)
+0.3.4 (2019-02-13)
+++++++++++++++++++
- Fix an f-string tokenizer error
-0.3.3 (2018-02-06)
+0.3.3 (2019-02-06)
+++++++++++++++++++
- Fix async errors in the diff parser
- A fix in iter_errors
- This is a very small bugfix release
-0.3.2 (2018-01-24)
+0.3.2 (2019-01-24)
+++++++++++++++++++
- 20+ bugfixes in the diff parser and 3 in the tokenizer
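
Both 0.5.1 fixes, sketched end to end (assuming parso 0.5.1 is installed; the node types in the comments are the expected ones):

    import parso

    # Unicode identifiers such as Tamil names now tokenize as NAME tokens
    # (davidhalter/jedi#1368).
    tree = parso.parse(u'மெல்லினம் = 1')
    print(tree.children[0].type)  # 'simple_stmt', not an error node

    # A line continuation inside a single-line f-string now parses.
    tree = parso.parse('f"abc\\\ndef"', version='3.6')
    print(tree.children[0].type)  # 'simple_stmt'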

View File: parso/__init__.py

@@ -43,7 +43,7 @@ from parso.grammar import Grammar, load_grammar
from parso.utils import split_lines, python_bytes_to_unicode
-__version__ = '0.5.0'
+__version__ = '0.5.1'
def parse(code=None, **kwargs):

View File: parso/grammar.py

@@ -57,7 +57,8 @@ class Grammar(object):
:param str path: The path to the file you want to open. Only needed for caching.
:param bool cache: Keeps a copy of the parser tree in RAM and on disk
if a path is given. Returns the cached trees if the corresponding
-files on disk have not changed.
+files on disk have not changed. Note that this stores pickle files
+on your file system (e.g. for Linux in ``~/.cache/parso/``).
:param bool diff_cache: Diffs the cached python module against the new
code and tries to parse only the parts that have changed. Returns
the same (changed) module that is found in cache. Using this option
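
A minimal usage sketch of the cache behavior described above (the file path is hypothetical):

    import parso

    grammar = parso.load_grammar()
    # The first call parses example.py and pickles the tree (e.g. to
    # ~/.cache/parso/ on Linux); subsequent calls return the cached tree
    # as long as the file on disk is unchanged.
    module = grammar.parse(path='example.py', cache=True)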

View File: parso/python/tokenize.py

@@ -23,6 +23,9 @@ from parso._compatibility import py_version
from parso.utils import split_lines
+# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
+MAX_UNICODE = '\U0010ffff'
STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
@@ -51,8 +54,13 @@ if py_version >= 30:
# Python 3 has str.isidentifier() to check if a char is a valid identifier
is_identifier = str.isidentifier
else:
-namechars = string.ascii_letters + '_'
-is_identifier = lambda s: s in namechars
+# Python 2 doesn't, but it's not that important anymore and if you tokenize
+# Python 2 code with this, it's still ok. It's just that parsing Python 3
+# code with this function is not 100% correct.
+# This just means that Python 2 code matches a few identifiers too much,
+# but that doesn't really matter.
+def is_identifier(s):
+return True
def group(*choices, **kwargs):
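
A rough illustration of the asymmetry between the two branches (Python 3 shown; the Python 2 fallback accepts any string, which only over-matches, as the comment above says):

    print('foo'.isidentifier())   # True
    print('ä'.isidentifier())     # True, unicode identifiers are valid names
    print('²'.isidentifier())     # False, not a valid identifier character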
@@ -118,9 +126,9 @@ def _get_token_collection(version_info):
return result
-fstring_string_single_line = _compile(r'(?:[^{}\r\n]+|\{\{|\}\})+')
+fstring_string_single_line = _compile(r'(?:\{\{|\}\}|\\(?:\r\n?|\n)|[^{}\r\n])+')
fstring_string_multi_line = _compile(r'(?:[^{}]+|\{\{|\}\})+')
-fstring_format_spec_single_line = _compile(r'[^{}\r\n]+')
+fstring_format_spec_single_line = _compile(r'(?:\\(?:\r\n?|\n)|[^{}\r\n])+')
fstring_format_spec_multi_line = _compile(r'[^{}]+')
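
What the new single-line pattern buys, sketched with the standard re module (the \\(?:\r\n?|\n) alternative is the addition):

    import re

    pat = re.compile(r'(?:\{\{|\}\}|\\(?:\r\n?|\n)|[^{}\r\n])+')
    # The old pattern stopped at the backslash; the new one consumes the
    # backslash-newline as part of the f-string's string portion.
    assert pat.match('abc\\\ndef').group(0) == 'abc\\\ndef'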
@@ -130,7 +138,16 @@ def _create_token_collection(version_info):
Whitespace = r'[ \f\t]*'
whitespace = _compile(Whitespace)
Comment = r'#[^\r\n]*'
-Name = r'\w+'
+# Python 2 is pretty much not working properly anymore, we just ignore
+# parsing unicode properly, which is fine, I guess.
+if version_info[0] == 2:
+Name = r'([A-Za-z_0-9]+)'
+elif sys.version_info[0] == 2:
+# Unfortunately the regex engine cannot deal with the regex below, so
+# just use this one.
+Name = r'(\w+)'
+else:
+Name = u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)'
if version_info >= (3, 6):
Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
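
The Python 3 pattern deliberately over-matches: it accepts every code point from U+0080 up to MAX_UNICODE, and the tokenizer afterwards uses is_identifier to split out anything illegal (see _split_illegal_unicode_name below). A small sketch:

    import re

    MAX_UNICODE = '\U0010ffff'
    name_re = re.compile(u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)')

    print(name_re.match(u'மெல்லினம்').group(0))  # the whole Tamil name matches
    print(name_re.match(u'ä²ö').group(0))  # matches 'ä²ö', although '²' is
                                           # not a valid identifier character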
@@ -340,7 +357,9 @@ def _find_fstring_string(endpats, fstring_stack, line, lnum, pos):
new_pos = pos
new_pos += len(string)
-if allow_multiline and (string.endswith('\n') or string.endswith('\r')):
+# even if allow_multiline is False, we still need to check for trailing
+# newlines, because a single-line f-string can contain line continuations
+if string.endswith('\n') or string.endswith('\r'):
tos.previous_lines += string
string = ''
else:
@@ -510,6 +529,24 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
if (initial in numchars or # ordinary number
(initial == '.' and token != '.' and token != '...')):
yield PythonToken(NUMBER, token, spos, prefix)
+elif pseudomatch.group(3) is not None: # ordinary name
+if token in always_break_tokens:
+fstring_stack[:] = []
+paren_level = 0
+# We only want to dedent if the token is on a new line.
+if re.match(r'[ \f\t]*$', line[:start]):
+while True:
+indent = indents.pop()
+if indent > start:
+yield PythonToken(DEDENT, '', spos, '')
+else:
+indents.append(indent)
+break
+if is_identifier(token):
+yield PythonToken(NAME, token, spos, prefix)
+else:
+for t in _split_illegal_unicode_name(token, spos, prefix):
+yield t # yield from Python 2
elif initial in '\r\n':
if any(not f.allow_multiline() for f in fstring_stack):
# Would use fstring_stack.clear, but that's not available
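
The observable effect of the new branch, sketched through the public tokenizer (assuming parso 0.5.1; an ENDMARKER follows the shown tokens):

    from parso.python.tokenize import tokenize

    # One regex match is split into NAME 'ä', ERRORTOKEN '²', NAME 'ö'
    # instead of yielding a single bogus NAME.
    for token in tokenize(u'ä²ö', version_info=(3, 6)):
        print(token.type, repr(token.string))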
@@ -564,20 +601,6 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
elif token in fstring_pattern_map: # The start of an fstring.
fstring_stack.append(FStringNode(fstring_pattern_map[token]))
yield PythonToken(FSTRING_START, token, spos, prefix)
-elif is_identifier(initial): # ordinary name
-if token in always_break_tokens:
-fstring_stack[:] = []
-paren_level = 0
-# We only want to dedent if the token is on a new line.
-if re.match(r'[ \f\t]*$', line[:start]):
-while True:
-indent = indents.pop()
-if indent > start:
-yield PythonToken(DEDENT, '', spos, '')
-else:
-indents.append(indent)
-break
-yield PythonToken(NAME, token, spos, prefix)
elif initial == '\\' and line[start:] in ('\\\n', '\\\r\n', '\\\r'): # continued stmt
additional_prefix += prefix + line[start:]
break
@@ -613,6 +636,39 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)
+def _split_illegal_unicode_name(token, start_pos, prefix):
+def create_token():
+return PythonToken(ERRORTOKEN if is_illegal else NAME, found, pos, prefix)
+found = ''
+is_illegal = False
+pos = start_pos
+for i, char in enumerate(token):
+if is_illegal:
+if is_identifier(char):
+yield create_token()
+found = char
+is_illegal = False
+prefix = ''
+pos = start_pos[0], start_pos[1] + i
+else:
+found += char
+else:
+new_found = found + char
+if is_identifier(new_found):
+found = new_found
+else:
+if found:
+yield create_token()
+prefix = ''
+pos = start_pos[0], start_pos[1] + i
+found = char
+is_illegal = True
+if found:
+yield create_token()
if __name__ == "__main__":
if len(sys.argv) >= 2:
path = sys.argv[1]
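
To make the generator above concrete, a hand trace of _split_illegal_unicode_name(u'ä²ö', (1, 0), '') under the logic shown (token boundaries are the expected ones):

    # i=0, 'ä': 'ä'.isidentifier() is True, accumulate found='ä'
    # i=1, '²': 'ä²'.isidentifier() is False, so yield NAME 'ä' at (1, 0)
    #           and start an illegal run with found='²'
    # i=2, 'ö': is_identifier('ö') is True, so yield ERRORTOKEN '²' at (1, 1)
    #           and start a legal run with found='ö'
    # end:      yield the pending NAME 'ö' at (1, 2)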

View File: parso/python/tree.py

@@ -43,7 +43,10 @@ Parser Tree Classes
"""
import re
-from collections import Mapping
+try:
+from collections.abc import Mapping
+except ImportError:
+from collections import Mapping
from parso._compatibility import utf8_repr, unicode
from parso.tree import Node, BaseNode, Leaf, ErrorNode, ErrorLeaf, \

View File: test/test_diff_parser.py

@@ -974,10 +974,12 @@ def test_random_unicode_characters(differ):
Those issues were all found with the fuzzer.
"""
differ.initialize('')
-differ.parse(u'\x1dĔBϞɛˁşʑ˳˻ȣſéÎ\x90̕ȟòwʘ\x1dĔBϞɛˁşʑ˳˻ȣſéÎ', parsers=1, expect_error_leaves=True)
+differ.parse(u'\x1dĔBϞɛˁşʑ˳˻ȣſéÎ\x90̕ȟòwʘ\x1dĔBϞɛˁşʑ˳˻ȣſéÎ', parsers=1,
+expect_error_leaves=True)
differ.parse(u'\r\r', parsers=1)
differ.parse(u"˟Ę\x05À\r rúƣ@\x8a\x15r()\n", parsers=1, expect_error_leaves=True)
-differ.parse(u'a\ntaǁ\rGĒōns__\n\nb', parsers=1)
+differ.parse(u'a\ntaǁ\rGĒōns__\n\nb', parsers=1,
+expect_error_leaves=sys.version_info[0] == 2)
s = ' if not (self, "_fi\x02\x0e\x08\n\nle"):'
differ.parse(s, parsers=1, expect_error_leaves=True)
differ.parse('')

View File: test/test_fstring.py

@@ -12,33 +12,57 @@ def grammar():
@pytest.mark.parametrize(
'code', [
-'{1}',
-'{1:}',
-'',
-'{1!a}',
-'{1!a:1}',
-'{1:1}',
-'{1:1.{32}}',
-'{1::>4}',
-'{foo} {bar}',
-'{x:{y}}',
-'{x:{y:}}',
-'{x:{y:1}}',
+# simple cases
+'f"{1}"',
+'f"""{1}"""',
+'f"{foo} {bar}"',
+# empty string
+'f""',
+'f""""""',
+# empty format specifier is okay
+'f"{1:}"',
+# use of conversion options
+'f"{1!a}"',
+'f"{1!a:1}"',
+# format specifiers
+'f"{1:1}"',
+'f"{1:1.{32}}"',
+'f"{1::>4}"',
+'f"{x:{y}}"',
+'f"{x:{y:}}"',
+'f"{x:{y:1}}"',
# Escapes
-'{{}}',
-'{{{1}}}',
-'{{{1}',
-'1{{2{{3',
-'}}',
+'f"{{}}"',
+'f"{{{1}}}"',
+'f"{{{1}"',
+'f"1{{2{{3"',
+'f"}}"',
# New Python 3.8 syntax f'{a=}'
-'{a=}',
-'{a()=}',
+'f"{a=}"',
+'f"{a()=}"',
+# multiline f-string
+'f"""abc\ndef"""',
+'f"""abc{\n123}def"""',
+# a line continuation inside of an fstring_string
+'f"abc\\\ndef"',
+'f"\\\n{123}\\\n"',
+# a line continuation inside of an fstring_expr
+'f"{\\\n123}"',
+# a line continuation inside of a format spec
+'f"{123:.2\\\nf}"',
]
)
def test_valid(code, grammar):
-code = 'f"""%s"""' % code
module = grammar.parse(code, error_recovery=False)
fstring = module.children[0]
assert fstring.type == 'fstring'
@@ -47,23 +71,34 @@ def test_valid(code, grammar):
@pytest.mark.parametrize(
'code', [
-'}',
-'{',
-'{1!{a}}',
-'{!{a}}',
-'{}',
-'{:}',
-'{:}}}',
-'{:1}',
-'{!:}',
-'{!}',
-'{!a}',
-'{1:{}}',
-'{1:{:}}',
+# an f-string can't contain unmatched curly braces
+'f"}"',
+'f"{"',
+'f"""}"""',
+'f"""{"""',
+# invalid conversion characters
+'f"{1!{a}}"',
+'f"{!{a}}"',
+# The curly braces must contain an expression
+'f"{}"',
+'f"{:}"',
+'f"{:}}}"',
+'f"{:1}"',
+'f"{!:}"',
+'f"{!}"',
+'f"{!a}"',
+# invalid (empty) format specifiers
+'f"{1:{}}"',
+'f"{1:{:}}"',
+# a newline without a line continuation inside a single-line string
+'f"abc\ndef"',
]
)
def test_invalid(code, grammar):
-code = 'f"""%s"""' % code
with pytest.raises(ParserSyntaxError):
grammar.parse(code, error_recovery=False)
@@ -95,6 +130,7 @@ def test_tokenize_start_pos(code, positions):
"""),
'f"foo',
'f"""foo',
+'f"abc\ndef"',
]
)
def test_roundtrip(grammar, code):

View File: test/test_tokenize.py

@@ -1,5 +1,6 @@
# -*- coding: utf-8 # This file contains Unicode characters.
+import sys
from textwrap import dedent
import pytest
@@ -16,6 +17,7 @@ from parso.python.tokenize import PythonToken
NAME = PythonTokenTypes.NAME
NEWLINE = PythonTokenTypes.NEWLINE
STRING = PythonTokenTypes.STRING
+NUMBER = PythonTokenTypes.NUMBER
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
@@ -140,7 +142,7 @@ def test_identifier_contains_unicode():
else:
# Unicode tokens in Python 2 seem to be identified as operators.
# They will be ignored in the parser, that's ok.
-assert unicode_token[0] == OP
+assert unicode_token[0] == ERRORTOKEN
def test_quoted_strings():
@@ -228,16 +230,29 @@ def test_endmarker_end_pos():
check('a\\')
+xfail_py2 = dict(marks=[pytest.mark.xfail(sys.version_info[0] == 2, reason='Python 2')])
@pytest.mark.parametrize(
('code', 'types'), [
# Indentation
(' foo', [INDENT, NAME, DEDENT]),
(' foo\n bar', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
(' foo\n bar \n baz', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME,
NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
(' foo\nbar', [INDENT, NAME, NEWLINE, DEDENT, NAME]),
+# Name stuff
+('1foo1', [NUMBER, NAME]),
+pytest.param(
+u'மெல்லினம்', [NAME],
+**xfail_py2),
+pytest.param(u'²', [ERRORTOKEN], **xfail_py2),
+pytest.param(u'ä²ö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
+pytest.param(u'ää²¹öö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
]
)
-def test_indentation(code, types):
+def test_token_types(code, types):
actual_types = [t.type for t in _get_token_list(code)]
assert actual_types == types + [ENDMARKER]
@@ -330,13 +345,46 @@ def test_backslash():
('f" "{}', [FSTRING_START, FSTRING_STRING, FSTRING_END, OP, OP]),
(r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
(r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
# format spec
(r'f"Some {x:.2f}{y}"', [FSTRING_START, FSTRING_STRING, OP, NAME, OP,
FSTRING_STRING, OP, OP, NAME, OP, FSTRING_END]),
+# multiline f-string
+('f"""abc\ndef"""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
+('f"""abc{\n123}def"""', [
+FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
+FSTRING_END
+]),
+# a line continuation inside of an fstring_string
+('f"abc\\\ndef"', [
+FSTRING_START, FSTRING_STRING, FSTRING_END
+]),
+('f"\\\n{123}\\\n"', [
+FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
+FSTRING_END
+]),
+# a line continuation inside of an fstring_expr
+('f"{\\\n123}"', [FSTRING_START, OP, NUMBER, OP, FSTRING_END]),
+# a line continuation inside of a format spec
+('f"{123:.2\\\nf}"', [
+FSTRING_START, OP, NUMBER, OP, FSTRING_STRING, OP, FSTRING_END
+]),
+# a newline without a line continuation inside a single-line string is
+# wrong, and will generate an ERRORTOKEN
+('f"abc\ndef"', [
+FSTRING_START, FSTRING_STRING, NEWLINE, NAME, ERRORTOKEN
+]),
# a more complex example
(r'print(f"Some {x:.2f}a{y}")', [
NAME, OP, FSTRING_START, FSTRING_STRING, OP, NAME, OP,
FSTRING_STRING, OP, FSTRING_STRING, OP, NAME, OP, FSTRING_END, OP
]),
]
)
def test_fstring(code, types, version_ge_py36):

View File: tox.ini

@@ -4,6 +4,7 @@ envlist = {py26,py27,py33,py34,py35,py36,py37}
extras = testing
deps =
py26,py33: pytest>=3.0.7,<3.3
+py27,py34: pytest<5
py26,py33: setuptools<37
coverage: coverage
setenv =