Mirror of https://github.com/davidhalter/parso.git, synced 2025-12-08 21:54:54 +08:00
Forgot to add the utils file.
parso/utils.py: 73 lines added (new file)
@@ -0,0 +1,73 @@
import re
from ast import literal_eval


def splitlines(string, keepends=False):
    """
    A splitlines for Python code. In contrast to Python's ``str.splitlines``,
    it treats form feeds and other special characters as normal text and only
    splits on ``\n`` and ``\r\n``.
    Also different: returns ``['']`` for an empty string input.

    In Python 2.7, ``str.splitlines`` treats form feeds as normal characters,
    whereas Python 3 also splits on them.
    """
    if keepends:
        lst = string.splitlines(True)

        # We have to merge lines that were broken by form feed characters.
        merge = []
        for i, line in enumerate(lst):
            if line.endswith('\f'):
                merge.append(i)

        for index in reversed(merge):
            try:
                lst[index] = lst[index] + lst[index + 1]
                del lst[index + 1]
            except IndexError:
                # index + 1 is out of range when the string ends with a form
                # feed, so there is nothing to merge.
                pass

        # The stdlib's implementation is inconsistent about the end when
        # calling it with/without keepends: one time there's an empty string
        # at the end, one time there's none.
        if string.endswith('\n') or string == '':
            lst.append('')
        return lst
    else:
        return re.split('\n|\r\n', string)


def source_to_unicode(source, encoding=None):
    def detect_encoding():
        """
        For the implementation of encoding definitions in Python, look at:
        - http://www.python.org/dev/peps/pep-0263/
        - http://docs.python.org/2/reference/lexical_analysis.html#encoding-declarations
        """
        byte_mark = literal_eval(r"b'\xef\xbb\xbf'")
        if source.startswith(byte_mark):
            # UTF-8 byte-order mark
            return 'utf-8'

        first_two_lines = re.match(br'(?:[^\n]*\n){0,2}', source).group(0)
        possible_encoding = re.search(br"coding[=:]\s*([-\w.]+)",
                                      first_two_lines)
        if possible_encoding:
            return possible_encoding.group(1)
        else:
            # The default if nothing else has been set -> PEP 263
            return encoding if encoding is not None else 'utf-8'

    if isinstance(source, unicode):
        # Already unicode; only str/bytes need decoding.
        return source

    encoding = detect_encoding()
    if not isinstance(encoding, unicode):
        encoding = unicode(encoding, 'utf-8', 'replace')
    # Cast to unicode by default.
    return unicode(source, encoding, 'replace')
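Below is a short usage sketch of the helpers in this commit. It is illustrative only, not part of the diff, and assumes the file above is importable as parso.utils:

# Usage sketch (illustrative, not part of the commit). Assumes the file above
# is importable as parso.utils.
from parso.utils import splitlines

# Only '\n' and '\r\n' split; a form feed stays inside the line.
assert splitlines('a\fb\nc') == ['a\fb', 'c']

# With keepends=True, lines that str.splitlines broke on '\f' are merged back,
# and a trailing empty string is appended to match the keepends=False branch.
assert splitlines('a\fb\n', keepends=True) == ['a\fb\n', '']

# Unlike str.splitlines, an empty input yields [''] rather than [].
assert splitlines('') == ['']

# source_to_unicode() decodes bytes via a UTF-8 BOM or a PEP 263 coding
# comment. As committed it relies on the ``unicode`` name (a Python 2
# builtin); on Python 3 it would need a compatibility alias such as
# ``unicode = str`` before calls like:
#
#   decoded = source_to_unicode(b'# -*- coding: latin-1 -*-\nx = 1\n')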