Forgot to add the utils file.

This commit is contained in:
Dave Halter
2017-05-15 11:35:20 -04:00
parent 539ab41186
commit f784e28eec

73
parso/utils.py Normal file
View File

@@ -0,0 +1,73 @@
import re
from ast import literal_eval
def splitlines(string, keepends=False):
"""
A splitlines for Python code. In contrast to Python's ``str.splitlines``,
looks at form feeds and other special characters as normal text. Just
splits ``\n`` and ``\r\n``.
Also different: Returns ``['']`` for an empty string input.
In Python 2.7 form feeds are used as normal characters when using
str.splitlines. However in Python 3 somewhere there was a decision to split
also on form feeds.
"""
if keepends:
lst = string.splitlines(True)
# We have to merge lines that were broken by form feed characters.
merge = []
for i, line in enumerate(lst):
if line.endswith('\f'):
merge.append(i)
for index in reversed(merge):
try:
lst[index] = lst[index] + lst[index + 1]
del lst[index + 1]
except IndexError:
# index + 1 can be empty and therefore there's no need to
# merge.
pass
# The stdlib's implementation of the end is inconsistent when calling
# it with/without keepends. One time there's an empty string in the
# end, one time there's none.
if string.endswith('\n') or string == '':
lst.append('')
return lst
else:
return re.split('\n|\r\n', string)
def source_to_unicode(source, encoding=None):
def detect_encoding():
"""
For the implementation of encoding definitions in Python, look at:
- http://www.python.org/dev/peps/pep-0263/
- http://docs.python.org/2/reference/lexical_analysis.html#encoding-declarations
"""
byte_mark = literal_eval(r"b'\xef\xbb\xbf'")
if source.startswith(byte_mark):
# UTF-8 byte-order mark
return 'utf-8'
first_two_lines = re.match(br'(?:[^\n]*\n){0,2}', source).group(0)
possible_encoding = re.search(br"coding[=:]\s*([-\w.]+)",
first_two_lines)
if possible_encoding:
return possible_encoding.group(1)
else:
# the default if nothing else has been set -> PEP 263
return encoding if encoding is not None else 'utf-8'
if isinstance(source, unicode):
# only cast str/bytes
return source
encoding = detect_encoding()
if not isinstance(encoding, unicode):
encoding = unicode(encoding, 'utf-8', 'replace')
# cast to unicode by default
return unicode(source, encoding, 'replace')