Remove the remove_last_newline hack and build that logic into error recovery instead.

Tests are passing except for diff parser tests.
Dave Halter
2017-05-31 21:24:24 +02:00
parent b367058af6
commit 814b16cc6c
10 changed files with 73 additions and 96 deletions
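
A sketch of the intended behavior after this change, based on the updated docstring in the first hunk below (parso.parse as used in the test suite; the names mirror the doctest):

    import parso

    # No trailing '\n' in the source: previously an artificial newline was
    # appended before tokenizing and stripped out of the tree afterwards.
    module = parso.parse('hello + 1')
    expr = module.children[0]
    assert expr.type == 'arith_expr'       # no simple_stmt/newline wrapper
    assert expr.get_code() == 'hello + 1'  # exact round-trip of the source
    assert expr.end_pos == (1, 9)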

View File

@@ -5,16 +5,18 @@ versions, file caching, round-trips and other stuff:
 >>> from parso import load_grammar
 >>> grammar = load_grammar(version='2.7')
 >>> module = grammar.parse('hello + 1')
->>> stmt = module.children[0]
->>> stmt
-PythonNode(simple_stmt, [PythonNode(arith_expr, [...]), <Newline: ''>])
->>> stmt.get_code()
+>>> expr = module.children[0]
+>>> expr
+PythonNode(arith_expr, [<Name: hello@1,0>, <Operator: +>, <Number: 1>])
+>>> expr.get_code()
 'hello + 1'
->>> name = stmt.children[0].children[0]
+>>> name = expr.children[0]
 >>> name
 <Name: hello@1,0>
 >>> name.end_pos
 (1, 5)
+>>> expr.end_pos
+(1, 9)
 """

 from parso.parser import ParserSyntaxError

View File

@@ -6,7 +6,6 @@ import re
 from parso._compatibility import FileNotFoundError, unicode
 from parso.pgen2.pgen import generate_grammar
 from parso.utils import splitlines, source_to_unicode
-from parso.python.parser import remove_last_newline
 from parso.python.diff import DiffParser
 from parso.tokenize import tokenize_lines
 from parso.cache import parser_cache, load_module, save_module
@@ -85,7 +84,7 @@ class Grammar(object):
         with open(path, 'rb') as f:
             code = source_to_unicode(f.read())

-        lines = tokenize_lines = splitlines(code, keepends=True)
+        lines = splitlines(code, keepends=True)
         if diff_cache:
             if self._diff_parser is None:
                 raise TypeError("You have to define a diff parser to be able "
@@ -108,19 +107,10 @@ class Grammar(object):
                             cache_path=cache_path)
                 return new_node

-        added_newline = not code.endswith('\n')
-        if added_newline:
-            code += '\n'
-            tokenize_lines = list(tokenize_lines)
-            tokenize_lines[-1] += '\n'
-            tokenize_lines.append('')
-
-        tokens = self._tokenizer(tokenize_lines)
+        tokens = self._tokenizer(lines)

         p = self._parser(self._pgen_grammar, error_recovery=error_recovery, start_symbol=start_symbol)
         root_node = p.parse(tokens=tokens)
-        if added_newline:
-            remove_last_newline(root_node)

         if cache or diff_cache:
             save_module(self._hashed, path, root_node, lines, pickling=cache,

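The tokenizer now consumes the splitlines() result unmodified. The invariant this relies on (parso.utils.splitlines, imported above): with keepends=True there is always one final entry for whatever follows the last newline, so the token stream can distinguish "ends with a newline" from "ends mid-line" without mutating its input. A quick sketch:

    from parso.utils import splitlines

    # The final entry is '' when the code ends with '\n'.
    assert splitlines('x = 1\n', keepends=True) == ['x = 1\n', '']
    assert splitlines('x = 1', keepends=True) == ['x = 1']
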
View File

@@ -6,8 +6,8 @@ class Normalizer(object):
         >>> normalizer = Normalizer()
         >>> @normalizer.register_rule
-        >>> class MyRule(Rule):
-        >>>     error_code = 42
+        ... class MyRule(Rule):
+        ...     error_code = 42
         """
         try:
             rules = cls.rules

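The docstring fix above is purely doctest syntax: a compound statement has to continue with `...`, while a second `>>>` starts a new statement, so the original example could never run. Flattened to plain code (assuming Normalizer and Rule are importable from parso.normalizer, as the docstring implies):

    from parso.normalizer import Normalizer, Rule  # assumed import path

    normalizer = Normalizer()

    @normalizer.register_rule
    class MyRule(Rule):
        error_code = 42
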
View File

@@ -11,7 +11,7 @@ from collections import namedtuple
 import logging

 from parso.utils import splitlines
-from parso.python.parser import Parser, remove_last_newline
+from parso.python.parser import Parser
 from parso.python.tree import EndMarker
 from parso.tokenize import (tokenize_lines, NEWLINE, TokenInfo,
                             ENDMARKER, INDENT, DEDENT)
@@ -120,14 +120,6 @@ class DiffParser(object):
         self._module._used_names = None
         self._parser_lines_new = new_lines
-        self._added_newline = False
-        if new_lines[-1] != '':
-            # The Python grammar needs a newline at the end of a file, but for
-            # everything else we keep working with new_lines here.
-            self._parser_lines_new = list(new_lines)
-            self._parser_lines_new[-1] += '\n'
-            self._parser_lines_new.append('')
-            self._added_newline = True
         self._reset()
@@ -141,7 +133,7 @@ class DiffParser(object):
             logging.debug('diff %s old[%s:%s] new[%s:%s]',
                           operation, i1 + 1, i2, j1 + 1, j2)

-            if j2 == line_length + int(self._added_newline):
+            if j2 == line_length:
                 # The empty part after the last newline is not relevant.
                 j2 -= 1
@@ -159,9 +151,6 @@ class DiffParser(object):
         # changed module.
         self._nodes_stack.close()

-        if self._added_newline:
-            remove_last_newline(self._module)
-
         last_pos = self._module.end_pos[0]
         if last_pos != line_length:
             current_lines = splitlines(self._module.get_code(), keepends=True)

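DiffParser compares the two keepends line lists with difflib-style opcodes, which is where the (operation, i1, i2, j1, j2) tuples in the logging call come from. Because a file ending in '\n' splits into one extra empty entry, the last opcode can reach one line past the real content, hence the `j2 -= 1`. A standalone sketch:

    import difflib

    old_lines = ['a\n', 'b\n', '']   # parso-style keepends split of 'a\nb\n'
    new_lines = ['a\n', 'c\n', '']
    matcher = difflib.SequenceMatcher(None, old_lines, new_lines)
    for operation, i1, i2, j1, j2 in matcher.get_opcodes():
        # The final '' entry takes part in the opcodes like any other line.
        print(operation, i1, i2, j1, j2)
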
View File

@@ -3,7 +3,7 @@ from parso import tokenize
 from parso.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER,
                          STRING, tok_name)
 from parso.parser import BaseParser
-from parso.utils import splitlines
+from parso.pgen2.parse import token_to_ilabel


 class Parser(BaseParser):
@@ -128,6 +128,51 @@ class Parser(BaseParser):
         allows using different grammars (even non-Python). However, error
         recovery is purely written for Python.
         """
+        def get_symbol_and_nodes(stack):
+            for dfa, state, (type_, nodes) in stack:
+                symbol = pgen_grammar.number2symbol[type_]
+                yield symbol, nodes
+
+        if typ == ENDMARKER:
+            def reduce_stack(states, newstate):
+                # reduce
+                state = newstate
+                while states[state] == [(0, state)]:
+                    self.pgen_parser._pop()
+
+                    dfa, state, (type_, nodes) = stack[-1]
+                    states, first = dfa
+
+            # In Python, statements need to end with a newline. But since it's
+            # possible (and valid in Python) that there's no newline at the
+            # end of a file, we have to recover even if the user doesn't want
+            # error recovery.
+            ilabel = token_to_ilabel(pgen_grammar, NEWLINE, value)
+
+            dfa, state, (type_, nodes) = stack[-1]
+            symbol = pgen_grammar.number2symbol[type_]
+            states, first = dfa
+            arcs = states[state]
+            # Look for a state with this label
+            for i, newstate in arcs:
+                if ilabel == i:
+                    if symbol == 'simple_stmt':
+                        # This is basically shifting
+                        stack[-1] = (dfa, newstate, (type_, nodes))
+
+                        reduce_stack(states, newstate)
+                        add_token_callback(typ, value, start_pos, prefix)
+                        return
+                    break
+
         if not self._error_recovery:
             return super(Parser, self).error_recovery(
                 pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
@@ -136,9 +181,8 @@ class Parser(BaseParser):
         def current_suite(stack):
             # For now just discard everything that is not a suite or
             # file_input, if we detect an error.
-            for index, (dfa, state, (type_, nodes)) in reversed(list(enumerate(stack))):
+            for index, (symbol, nodes) in reversed(list(enumerate(get_symbol_and_nodes(stack)))):
                 # `suite` can sometimes be only simple_stmt, not stmt.
-                symbol = pgen_grammar.number2symbol[type_]
                 if symbol == 'file_input':
                     break
                 elif symbol == 'suite' and len(nodes) > 1:
@@ -191,58 +235,4 @@ class Parser(BaseParser):
                 self._indent_counter -= 1
             elif typ == INDENT:
                 self._indent_counter += 1
             yield typ, value, start_pos, prefix
-
-
-def remove_last_newline(node):
-    def calculate_end_pos(leaf, text):
-        if leaf is None:
-            end_pos = (1, 0)
-        else:
-            end_pos = leaf.end_pos
-
-        lines = splitlines(text, keepends=True)
-        if len(lines) == 1:
-            return end_pos[0], end_pos[1] + len(lines[0])
-        else:
-            return end_pos[0] + len(lines) - 1, len(lines[-1])
-
-    endmarker = node.children[-1]
-    # The newline is either in the endmarker as a prefix or the previous
-    # leaf as a newline token.
-    prefix = endmarker.prefix
-    leaf = endmarker.get_previous_leaf()
-    if prefix:
-        text = prefix
-    else:
-        if leaf is None:
-            raise ValueError("You're trying to remove a newline from an empty module.")
-
-        text = leaf.value
-
-    if not text.endswith('\n'):
-        raise ValueError("There's no newline at the end, cannot remove it.")
-
-    text = text[:-1]
-    if text and text[-1] == '\r':
-        # By adding an artificial newline this creates weird side effects for
-        # \r at the end of files that would normally be error leafs. Try to
-        # correct that here.
-        text = text[:-1]
-        start_pos = calculate_end_pos(leaf, text)
-        error_token = tree.PythonErrorLeaf('errortoken', '\r', start_pos, prefix=text)
-        node.children.insert(-2, error_token)
-
-        # Cleanup
-        leaf = error_token
-        text = ''
-
-    if prefix:
-        endmarker.prefix = text
-        endmarker.start_pos = calculate_end_pos(leaf, text)
-    else:
-        leaf.value = text
-        endmarker.start_pos = leaf.end_pos

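What the new ENDMARKER branch buys in practice: a file that stops mid-statement still parses, and no phantom newline node shows up among the children (compare the test change further below). A sketch:

    import parso

    children = parso.parse('(1 for def').children
    # The stray 'newline' entry is gone from the expected types.
    assert [c.type for c in children] == ['error_node', 'error_node', 'endmarker']
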
View File

@@ -17,7 +17,7 @@ class PrefixPart(object):
 _comment = r'#[^\n\r\f]*'
-_backslash = r'\\\r?\n?'
+_backslash = r'\\\r?\n'
 _whitespace = r' +'
 _tabs = r'\t+'
 _newline = r'\r?\n'

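The tightened _backslash pattern means a backslash only counts as a prefix part when it actually continues a line; a lone '\' at the end of input no longer matches (see the two test removals at the bottom of this diff). A stdlib-only sketch of the difference:

    import re

    old_backslash = re.compile(r'\\\r?\n?')
    new_backslash = re.compile(r'\\\r?\n')

    assert old_backslash.match('\\')           # a lone backslash used to match
    assert new_backslash.match('\\') is None   # now the newline is required
    assert new_backslash.match('\\\n') and new_backslash.match('\\\r\n')
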
View File

@@ -251,7 +251,9 @@ def tokenize_lines(lines):
                 txt = line[pos:]
                 if txt.endswith('\n'):
                     new_line = True
-                yield TokenInfo(ERRORTOKEN, txt, (lnum, pos), prefix)
+                # TODO remove prefix?
+                yield TokenInfo(ERRORTOKEN, txt, (lnum, pos), additional_prefix)
+                additional_prefix = ''
                 break

             prefix = additional_prefix + pseudomatch.group(1)
@@ -259,6 +261,12 @@ def tokenize_lines(lines):
             start, pos = pseudomatch.span(2)
             spos = (lnum, start)
             token = pseudomatch.group(2)
+            if token == '':
+                assert prefix
+                additional_prefix = prefix
+                # This means that we have a line with whitespace/comments at
+                # the end, which just results in an endmarker.
+                break
             initial = token[0]

             if new_line and initial not in '\r\n#':

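With the new `token == ''` branch, a final line holding only whitespace or a comment emits no token of its own; its text is carried along as additional_prefix and ends up on the ENDMARKER. A sketch using this commit's imports (exact TokenInfo repr aside):

    from parso.tokenize import tokenize_lines
    from parso.utils import splitlines

    code = 'x = 1\n# only a comment on the last line'
    tokens = list(tokenize_lines(splitlines(code, keepends=True)))
    # The comment line produced no token of its own; it travels as the
    # prefix of the final ENDMARKER.
    print(tokens[-1])
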
View File

@@ -81,7 +81,7 @@ def test_incomplete_list_comprehension():
     # parser only valid statements generate one.
     children = parse('(1 for def').children
     assert [c.type for c in children] == \
-        ['error_node', 'error_node', 'newline', 'endmarker']
+        ['error_node', 'error_node', 'endmarker']


 def test_newline_positions():
@@ -153,7 +153,7 @@ def test_python2_octal():
 def test_python3_octal():
     module = parse('0o660')
     if py_version >= 30:
-        assert module.children[0].children[0].type == 'number'
+        assert module.children[0].type == 'number'
     else:
         assert module.children[0].type == 'error_node'

View File

@@ -63,7 +63,7 @@ class TestsFunctionAndLambdaParsing(object):
 def test_end_pos_line():
     # jedi issue #150
-    s = "x()\nx( )\nx( )\nx ( )"
+    s = "x()\nx( )\nx( )\nx ( )\n"

     module = parse(s)
     for i, simple_stmt in enumerate(module.children[:-1]):
         expr_stmt = simple_stmt.children[0]

View File

@@ -12,7 +12,6 @@ import parso
     (' \f ', [' ', '\f', ' ']),
     (' \f ', [' ', '\f', ' ']),
     (' \r\n', [' ', '\r\n']),
-    ('\\', ['\\']),
     ('\\\n', ['\\\n']),
     ('\\\r\n', ['\\\r\n']),
     ('\t\t\n\t', ['\t\t', '\n', '\t']),
@@ -43,7 +42,6 @@ def test_simple_prefix_splitting(string, tokens):
     ('\r\n', ['newline']),
     ('\f', ['formfeed']),
     ('\\\n', ['backslash']),
-    ('\r', ['newline']),
 ])
 def test_prefix_splitting_types(string, types):
     tree = parso.parse(string)