diff --git a/parso/utils.py b/parso/utils.py index dfc98c8..579c4cc 100644 --- a/parso/utils.py +++ b/parso/utils.py @@ -5,6 +5,20 @@ from ast import literal_eval from parso._compatibility import unicode, total_ordering +# The following is a list of characters that str.splitlines treats as line +# breaks, but that Python itself does not. In Python only \r (Carriage Return, +# 0xD) and \n (Line Feed, 0xA) are allowed to split lines. +_NON_LINE_BREAKS = ( + u'\v', # Vertical Tabulation 0xB + u'\f', # Form Feed 0xC + u'\x1C', # File Separator + u'\x1D', # Group Separator + u'\x1E', # Record Separator + u'\x85', # Next Line (NEL - Equivalent to CR+LF. + # Used to mark end-of-line on some IBM mainframes.) + u'\u2028', # Line Separator + u'\u2029', # Paragraph Separator +) Version = namedtuple('Version', 'major, minor, micro') @@ -26,8 +40,13 @@ def split_lines(string, keepends=False): # We have to merge lines that were broken by form feed characters. merge = [] for i, line in enumerate(lst): - if line.endswith('\f'): - merge.append(i) + try: + last_chr = line[-1] + except IndexError: + pass + else: + if last_chr in _NON_LINE_BREAKS: + merge.append(i) for index in reversed(merge): try: diff --git a/test/test_diff_parser.py b/test/test_diff_parser.py index f90c1c1..172e3df 100644 --- a/test/test_diff_parser.py +++ b/test/test_diff_parser.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from textwrap import dedent import logging @@ -952,3 +953,9 @@ def test_wrong_backslash(differ): differ.initialize(code1) differ.parse(code2, parsers=2, copies=2, expect_error_leaves=True) differ.parse(code1, parsers=1, copies=1) + + +def test_random_unicode_characters(differ): + differ.initialize('') + differ.parse('\x1dĔBϞɛˁşʑ˳˻ȣſéÎ\x90̕ȟòwʘ\x1dĔBϞɛˁşʑ˳˻ȣſéÎ', parsers=1, expect_error_leaves=True) + differ.parse('') diff --git a/test/test_utils.py b/test/test_utils.py index ea32589..3078151 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -29,6 +29,12 @@ import pytest ('\r', ['', ''], False), ('\r', 
['\r', ''], True), + + # Invalid line breaks + ('a\vb', ['a\vb'], False), + ('a\vb', ['a\vb'], True), + ('\x1C', ['\x1C'], False), + ('\x1C', ['\x1C'], True), ] ) def test_split_lines(string, expected_result, keepends):