From 734a4b0e674ad87d14d5b3593fb0abda2e0260c2 Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Sat, 4 Apr 2020 15:38:10 +0200 Subject: [PATCH] Remove support for specialized treatment of form feeds This is a very intentional change. Previously form feeds were handled very poorly and sometimes were not counted as indentation. This obviously makes sense. But at the same time indentation is very tricky to deal with (both for editors and parso). Especially in the diff parser this led to a lot of very weird issues. The decision probably makes sense since: 1. Almost nobody uses form feeds in the first place. 2. People that use form feeds, like Barry Warsaw, often put a newline after them (e.g. Python's email.__init__). 3. If you write an editor you want to be able to identify a Unicode character with a clear line/column. This would not be the case if form feeds were just ignored when counting. Form feeds will still work in Jedi, will not cause parse errors and in general you should be fine using them. It might just cause Jedi to count them as indentation **if** you use it like '\f foo()'. This is however confusing for most editors anyway. It leads to a weird display e.g. in VIM, even if it's perfectly valid code in Python. Since parso is a code analysis parser and not the language's parser I think it's fine to ignore this edge case. 
--- parso/python/diff.py | 3 ++- parso/python/tokenize.py | 5 ----- test/test_diff_parser.py | 21 +++++++++++++++++++++ test/test_pgen2.py | 18 +++++++++++------- test/test_tokenize.py | 4 +++- 5 files changed, 37 insertions(+), 14 deletions(-) diff --git a/parso/python/diff.py b/parso/python/diff.py index d69d0c5..ae2832c 100644 --- a/parso/python/diff.py +++ b/parso/python/diff.py @@ -602,7 +602,8 @@ class _NodesTree(object): is_endmarker = last_leaf.type == 'endmarker' self._prefix_remainder = '' if is_endmarker: - separation = max(last_leaf.prefix.rfind('\n'), last_leaf.prefix.rfind('\r')) + prefix = last_leaf.prefix + separation = max(prefix.rfind('\n'), prefix.rfind('\r')) if separation > -1: # Remove the whitespace part of the prefix after a newline. # That is not relevant if parentheses were opened. Always parse diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py index ebb2c43..f490bb0 100644 --- a/parso/python/tokenize.py +++ b/parso/python/tokenize.py @@ -522,12 +522,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0), indents=None): if new_line and initial not in '\r\n#' and (initial != '\\' or pseudomatch is None): new_line = False if paren_level == 0 and not fstring_stack: - i = 0 indent_start = start - while line[i] == '\f': - i += 1 - # TODO don't we need to change spos as well? 
- indent_start -= 1 if indent_start > indents[-1]: yield PythonToken(INDENT, '', spos, '') indents.append(indent_start) diff --git a/test/test_diff_parser.py b/test/test_diff_parser.py index e1cb51e..ae65980 100644 --- a/test/test_diff_parser.py +++ b/test/test_diff_parser.py @@ -1406,3 +1406,24 @@ def test_error_dedent_in_function(differ): ''') differ.initialize(code1) differ.parse(code2, parsers=ANY, copies=ANY, expect_error_leaves=True) + + +def test_x(differ): + code1 = dedent('''\ + @bla + async def foo(): + 1 + yield from [] + return + return '' + ''') + code2 = dedent('''\ + @bla + async def foo(): + 1 + \x0cimport + return + return '' + ''') + differ.initialize(code1) + differ.parse(code2, parsers=ANY, copies=ANY, expect_error_leaves=True) diff --git a/test/test_pgen2.py b/test/test_pgen2.py index 30b0c23..4a80922 100644 --- a/test/test_pgen2.py +++ b/test/test_pgen2.py @@ -29,13 +29,17 @@ def _invalid_syntax(code, version=None, **kwargs): print(module.children) -def test_formfeed(each_py2_version): - s = u"""print 1\n\x0Cprint 2\n""" - t = _parse(s, each_py2_version) - assert t.children[0].children[0].type == 'print_stmt' - assert t.children[1].children[0].type == 'print_stmt' - s = u"""1\n\x0C\x0C2\n""" - t = _parse(s, each_py2_version) +def test_formfeed(each_version): + s = u"foo\n\x0c\nfoo\n" + t = _parse(s, each_version) + assert t.children[0].children[0].type == 'name' + assert t.children[1].children[0].type == 'name' + s = u"1\n\x0c\x0c\n2\n" + t = _parse(s, each_version) + + with pytest.raises(ParserSyntaxError): + s = u"\n\x0c2\n" + _parse(s, each_version) def test_matrix_multiplication_operator(works_ge_py35): diff --git a/test/test_tokenize.py b/test/test_tokenize.py index ff396a6..e8500ca 100644 --- a/test/test_tokenize.py +++ b/test/test_tokenize.py @@ -332,11 +332,13 @@ def test_brackets_no_indentation(): def test_form_feed(): - error_token, endmarker = _get_token_list(dedent('''\ + indent, error_token, dedent_, endmarker = 
_get_token_list(dedent('''\ \f"""''')) assert error_token.prefix == '\f' assert error_token.string == '"""' assert endmarker.prefix == '' + assert indent.type == INDENT + assert dedent_.type == DEDENT def test_carriage_return():