diff --git a/parso/__init__.py b/parso/__init__.py
index b1bf888..a2cb4f2 100644
--- a/parso/__init__.py
+++ b/parso/__init__.py
@@ -5,16 +5,18 @@ versions, file caching, round-trips and other stuff:
 >>> from parso import load_grammar
 >>> grammar = load_grammar(version='2.7')
 >>> module = grammar.parse('hello + 1')
->>> stmt = module.children[0]
->>> stmt
-PythonNode(simple_stmt, [PythonNode(arith_expr, [...]), <Newline: '\n'>])
->>> stmt.get_code()
+>>> expr = module.children[0]
+>>> expr
+PythonNode(arith_expr, [<Name: hello@1,0>, <Operator: +>, <Number: 1>])
+>>> expr.get_code()
 'hello + 1'
->>> name = stmt.children[0].children[0]
+>>> name = expr.children[0]
 >>> name
 <Name: hello@1,0>
 >>> name.end_pos
 (1, 5)
+>>> expr.end_pos
+(1, 9)
 """
 
 from parso.parser import ParserSyntaxError
diff --git a/parso/grammar.py b/parso/grammar.py
index b48ae24..10689f0 100644
--- a/parso/grammar.py
+++ b/parso/grammar.py
@@ -6,7 +6,6 @@ import re
 from parso._compatibility import FileNotFoundError, unicode
 from parso.pgen2.pgen import generate_grammar
 from parso.utils import splitlines, source_to_unicode
-from parso.python.parser import remove_last_newline
 from parso.python.diff import DiffParser
 from parso.tokenize import tokenize_lines
 from parso.cache import parser_cache, load_module, save_module
@@ -85,7 +84,7 @@ class Grammar(object):
             with open(path, 'rb') as f:
                 code = source_to_unicode(f.read())
 
-        lines = tokenize_lines = splitlines(code, keepends=True)
+        lines = splitlines(code, keepends=True)
         if diff_cache:
             if self._diff_parser is None:
                 raise TypeError("You have to define a diff parser to be able "
@@ -108,19 +107,10 @@ class Grammar(object):
                             cache_path=cache_path)
                 return new_node
-        added_newline = not code.endswith('\n')
-        if added_newline:
-            code += '\n'
-            tokenize_lines = list(tokenize_lines)
-            tokenize_lines[-1] += '\n'
-            tokenize_lines.append('')
-
-        tokens = self._tokenizer(tokenize_lines)
+        tokens = self._tokenizer(lines)
         p = self._parser(self._pgen_grammar,
                          error_recovery=error_recovery,
                          start_symbol=start_symbol)
         root_node = p.parse(tokens=tokens)
-        if added_newline:
-            remove_last_newline(root_node)
 
         if cache or diff_cache:
             save_module(self._hashed, path, root_node, lines,
                         pickling=cache,
diff --git a/parso/normalizer.py b/parso/normalizer.py
index c5d843f..f26669f 100644
--- a/parso/normalizer.py
+++ b/parso/normalizer.py
@@ -6,8 +6,8 @@ class Normalizer(object):
 
         >>> normalizer = Normalizer()
         >>> @normalizer.register_rule
-        >>> class MyRule(Rule):
-        >>>     error_code = 42
+        ... class MyRule(Rule):
+        ...     error_code = 42
         """
         try:
             rules = cls.rules
diff --git a/parso/python/diff.py b/parso/python/diff.py
index 1bbfe34..fa2cd92 100644
--- a/parso/python/diff.py
+++ b/parso/python/diff.py
@@ -11,7 +11,7 @@ from collections import namedtuple
 import logging
 
 from parso.utils import splitlines
-from parso.python.parser import Parser, remove_last_newline
+from parso.python.parser import Parser
 from parso.python.tree import EndMarker
 from parso.tokenize import (tokenize_lines, NEWLINE, TokenInfo, ENDMARKER,
                             INDENT, DEDENT)
@@ -120,14 +120,6 @@ class DiffParser(object):
 
         self._module._used_names = None
         self._parser_lines_new = new_lines
-        self._added_newline = False
-        if new_lines[-1] != '':
-            # The Python grammar needs a newline at the end of a file, but for
-            # everything else we keep working with new_lines here.
-            self._parser_lines_new = list(new_lines)
-            self._parser_lines_new[-1] += '\n'
-            self._parser_lines_new.append('')
-            self._added_newline = True
 
         self._reset()
 
@@ -141,7 +133,7 @@ class DiffParser(object):
             logging.debug('diff %s old[%s:%s] new[%s:%s]',
                           operation, i1 + 1, i2, j1 + 1, j2)
 
-            if j2 == line_length + int(self._added_newline):
+            if j2 == line_length:
                 # The empty part after the last newline is not relevant.
                 j2 -= 1
 
@@ -159,9 +151,6 @@ class DiffParser(object):
             # changed module.
             self._nodes_stack.close()
 
-        if self._added_newline:
-            remove_last_newline(self._module)
-
         last_pos = self._module.end_pos[0]
         if last_pos != line_length:
             current_lines = splitlines(self._module.get_code(), keepends=True)
diff --git a/parso/python/parser.py b/parso/python/parser.py
index 4300a8a..c1af961 100644
--- a/parso/python/parser.py
+++ b/parso/python/parser.py
@@ -3,7 +3,7 @@ from parso import tokenize
 from parso.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER, STRING,
                          tok_name)
 from parso.parser import BaseParser
-from parso.utils import splitlines
+from parso.pgen2.parse import token_to_ilabel
 
 
 class Parser(BaseParser):
@@ -128,6 +128,43 @@ class Parser(BaseParser):
         allows using different grammars (even non-Python). However, error
         recovery is purely written for Python.
         """
+        def get_symbol_and_nodes(stack):
+            for dfa, state, (type_, nodes) in stack:
+                symbol = pgen_grammar.number2symbol[type_]
+                yield symbol, nodes
+
+        if typ == ENDMARKER:
+            def reduce_stack(states, newstate):
+                # Reduce while the state's only arc is the accepting loop.
+                state = newstate
+                while states[state] == [(0, state)]:
+                    self.pgen_parser._pop()
+
+                    dfa, state, (type_, nodes) = stack[-1]
+                    states, first = dfa
+
+            # In Python, statements need to end with a newline. But since it's
+            # possible (and valid in Python) that there's no newline at the
+            # end of a file, we have to recover even if the user doesn't want
+            # error recovery.
+            ilabel = token_to_ilabel(pgen_grammar, NEWLINE, value)
+
+            dfa, state, (type_, nodes) = stack[-1]
+            symbol = pgen_grammar.number2symbol[type_]
+            states, first = dfa
+            arcs = states[state]
+            # Look for a state with this label
+            for i, newstate in arcs:
+                if ilabel == i:
+                    if symbol == 'simple_stmt':
+                        # This is basically shifting
+                        stack[-1] = (dfa, newstate, (type_, nodes))
+
+                        reduce_stack(states, newstate)
+                        add_token_callback(typ, value, start_pos, prefix)
+                        return
+                    break
+
         if not self._error_recovery:
             return super(Parser, self).error_recovery(
                 pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
@@ -136,9 +173,8 @@ class Parser(BaseParser):
         def current_suite(stack):
             # For now just discard everything that is not a suite or
             # file_input, if we detect an error.
-            for index, (dfa, state, (type_, nodes)) in reversed(list(enumerate(stack))):
+            for index, (symbol, nodes) in reversed(list(enumerate(get_symbol_and_nodes(stack)))):
                 # `suite` can sometimes be only simple_stmt, not stmt.
-                symbol = pgen_grammar.number2symbol[type_]
                 if symbol == 'file_input':
                     break
                 elif symbol == 'suite' and len(nodes) > 1:
@@ -191,58 +227,4 @@ class Parser(BaseParser):
                 self._indent_counter -= 1
             elif typ == INDENT:
                 self._indent_counter += 1
             yield typ, value, start_pos, prefix
-
-
-def remove_last_newline(node):
-    def calculate_end_pos(leaf, text):
-        if leaf is None:
-            end_pos = (1, 0)
-        else:
-            end_pos = leaf.end_pos
-
-        lines = splitlines(text, keepends=True)
-        if len(lines) == 1:
-            return end_pos[0], end_pos[1] + len(lines[0])
-        else:
-            return end_pos[0] + len(lines) - 1, len(lines[-1])
-
-    endmarker = node.children[-1]
-    # The newline is either in the endmarker as a prefix or the previous
-    # leaf as a newline token.
-    prefix = endmarker.prefix
-    leaf = endmarker.get_previous_leaf()
-    if prefix:
-        text = prefix
-    else:
-        if leaf is None:
-            raise ValueError("You're trying to remove a newline from an empty module.")
-
-        text = leaf.value
-
-    if not text.endswith('\n'):
-        raise ValueError("There's no newline at the end, cannot remove it.")
-
-    text = text[:-1]
-    if text and text[-1] == '\r':
-        # By adding an artificial newline this creates weird side effects for
-        # \r at the end of files that would normally be error leafs. Try to
-        # correct that here.
-        text = text[:-1]
-        start_pos = calculate_end_pos(leaf, text)
-        error_token = tree.PythonErrorLeaf('errortoken', '\r', start_pos, prefix=text)
-        node.children.insert(-2, error_token)
-
-        # Cleanup
-        leaf = error_token
-        text = ''
-
-    if prefix:
-        endmarker.prefix = text
-
-
-        endmarker.start_pos = calculate_end_pos(leaf, text)
-    else:
-        leaf.value = text
-        endmarker.start_pos = leaf.end_pos
diff --git a/parso/python/prefix.py b/parso/python/prefix.py
index 06bbf53..ec851b7 100644
--- a/parso/python/prefix.py
+++ b/parso/python/prefix.py
@@ -17,7 +17,7 @@ class PrefixPart(object):
 
 
 _comment = r'#[^\n\r\f]*'
-_backslash = r'\\\r?\n?'
+_backslash = r'\\\r?\n'
 _whitespace = r' +'
 _tabs = r'\t+'
 _newline = r'\r?\n'
diff --git a/parso/tokenize.py b/parso/tokenize.py
index 6135931..6c5bbc9 100644
--- a/parso/tokenize.py
+++ b/parso/tokenize.py
@@ -251,7 +251,9 @@ def tokenize_lines(lines):
                 txt = line[pos:]
                 if txt.endswith('\n'):
                     new_line = True
-                yield TokenInfo(ERRORTOKEN, txt, (lnum, pos), prefix)
+                # TODO remove prefix?
+                yield TokenInfo(ERRORTOKEN, txt, (lnum, pos), additional_prefix)
+                additional_prefix = ''
                 break
 
             prefix = additional_prefix + pseudomatch.group(1)
@@ -259,6 +261,12 @@ def tokenize_lines(lines):
             start, pos = pseudomatch.span(2)
             spos = (lnum, start)
             token = pseudomatch.group(2)
+            if token == '':
+                assert prefix
+                additional_prefix = prefix
+                # This means that we have a line with whitespace/comments at
+                # the end, which just results in an endmarker.
+                break
             initial = token[0]
 
             if new_line and initial not in '\r\n#':
diff --git a/test/test_parser.py b/test/test_parser.py
index ee1becc..cc83ee5 100644
--- a/test/test_parser.py
+++ b/test/test_parser.py
@@ -81,7 +81,7 @@ def test_incomplete_list_comprehension():
     # parser only valid statements generate one.
     children = parse('(1 for def').children
     assert [c.type for c in children] == \
-        ['error_node', 'error_node', 'newline', 'endmarker']
+        ['error_node', 'error_node', 'endmarker']
 
 
 def test_newline_positions():
@@ -153,7 +153,7 @@ def test_python2_octal():
 def test_python3_octal():
     module = parse('0o660')
     if py_version >= 30:
-        assert module.children[0].children[0].type == 'number'
+        assert module.children[0].type == 'number'
     else:
         assert module.children[0].type == 'error_node'
 
diff --git a/test/test_parser_tree.py b/test/test_parser_tree.py
index 8d27fd6..d39c347 100644
--- a/test/test_parser_tree.py
+++ b/test/test_parser_tree.py
@@ -63,7 +63,7 @@ class TestsFunctionAndLambdaParsing(object):
 
 def test_end_pos_line():
     # jedi issue #150
-    s = "x()\nx( )\nx(  )\nx (  )"
+    s = "x()\nx( )\nx(  )\nx (  )\n"
     module = parse(s)
     for i, simple_stmt in enumerate(module.children[:-1]):
         expr_stmt = simple_stmt.children[0]
diff --git a/test/test_prefix.py b/test/test_prefix.py
index ab0d1d4..2e47ebc 100644
--- a/test/test_prefix.py
+++ b/test/test_prefix.py
@@ -12,7 +12,6 @@ import parso
     (' \f ', [' ', '\f', ' ']),
     (' \f ', [' ', '\f', ' ']),
     (' \r\n', [' ', '\r\n']),
-    ('\\', ['\\']),
     ('\\\n', ['\\\n']),
     ('\\\r\n', ['\\\r\n']),
     ('\t\t\n\t', ['\t\t', '\n', '\t']),
@@ -43,7 +42,6 @@ def test_simple_prefix_splitting(string, tokens):
     ('\r\n', ['newline']),
     ('\f', ['formfeed']),
    ('\\\n', ['backslash']),
-    ('\r', ['newline']),
 ])
 def test_prefix_splitting_types(string, types):
     tree = parso.parse(string)
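
A quick doctest-style illustration of the user-visible effect of this patch (a sketch inferred from the updated module docstring and tests above, not output captured from a live run; parso.parse is the top-level helper already used in test_prefix.py):

>>> import parso
>>> module = parso.parse('hello + 1')  # note: no trailing newline in the source
>>> [c.type for c in module.children]  # no artificial 'newline' leaf is inserted anymore
['arith_expr', 'endmarker']
>>> module.get_code()  # the round-trip no longer appends a newline either
'hello + 1'
>>> module.end_pos
(1, 9)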