mirror of
https://github.com/davidhalter/parso.git
synced 2025-12-08 05:34:51 +08:00
Some characters like Vertical Tab or File Separator were used as line separators. This is not legal. Line Separators in Python are only Carriage Return \r and Line Feed \n.
176 lines
5.9 KiB
Python
176 lines
5.9 KiB
Python
from collections import namedtuple
|
|
import re
|
|
import sys
|
|
from ast import literal_eval
|
|
|
|
from parso._compatibility import unicode, total_ordering
|
|
|
|
# The following is a list in Python that are line breaks in str.splitlines, but
|
|
# not in Python. In Python only \r (Carriage Return, 0xD) and \n (Line Feed,
|
|
# 0xA) are allowed to split lines.
|
|
_NON_LINE_BREAKS = (
|
|
u'\v', # Vertical Tabulation 0xB
|
|
u'\f', # Form Feed 0xC
|
|
u'\x1C', # File Separator
|
|
u'\x1D', # Group Separator
|
|
u'\x1E', # Record Separator
|
|
u'\x85', # Next Line (NEL - Equivalent to CR+LF.
|
|
# Used to mark end-of-line on some IBM mainframes.)
|
|
u'\u2028', # Line Separator
|
|
u'\u2029', # Paragraph Separator
|
|
)
|
|
|
|
Version = namedtuple('Version', 'major, minor, micro')
|
|
|
|
|
|
def split_lines(string, keepends=False):
|
|
r"""
|
|
Intended for Python code. In contrast to Python's :py:meth:`str.splitlines`,
|
|
looks at form feeds and other special characters as normal text. Just
|
|
splits ``\n`` and ``\r\n``.
|
|
Also different: Returns ``[""]`` for an empty string input.
|
|
|
|
In Python 2.7 form feeds are used as normal characters when using
|
|
str.splitlines. However in Python 3 somewhere there was a decision to split
|
|
also on form feeds.
|
|
"""
|
|
if keepends:
|
|
lst = string.splitlines(True)
|
|
|
|
# We have to merge lines that were broken by form feed characters.
|
|
merge = []
|
|
for i, line in enumerate(lst):
|
|
try:
|
|
last_chr = line[-1]
|
|
except IndexError:
|
|
pass
|
|
else:
|
|
if last_chr in _NON_LINE_BREAKS:
|
|
merge.append(i)
|
|
|
|
for index in reversed(merge):
|
|
try:
|
|
lst[index] = lst[index] + lst[index + 1]
|
|
del lst[index + 1]
|
|
except IndexError:
|
|
# index + 1 can be empty and therefore there's no need to
|
|
# merge.
|
|
pass
|
|
|
|
# The stdlib's implementation of the end is inconsistent when calling
|
|
# it with/without keepends. One time there's an empty string in the
|
|
# end, one time there's none.
|
|
if string.endswith('\n') or string.endswith('\r') or string == '':
|
|
lst.append('')
|
|
return lst
|
|
else:
|
|
return re.split(r'\n|\r\n|\r', string)
|
|
|
|
|
|
def python_bytes_to_unicode(source, encoding='utf-8', errors='strict'):
|
|
"""
|
|
Checks for unicode BOMs and PEP 263 encoding declarations. Then returns a
|
|
unicode object like in :py:meth:`bytes.decode`.
|
|
|
|
:param encoding: See :py:meth:`bytes.decode` documentation.
|
|
:param errors: See :py:meth:`bytes.decode` documentation. ``errors`` can be
|
|
``'strict'``, ``'replace'`` or ``'ignore'``.
|
|
"""
|
|
def detect_encoding():
|
|
"""
|
|
For the implementation of encoding definitions in Python, look at:
|
|
- http://www.python.org/dev/peps/pep-0263/
|
|
- http://docs.python.org/2/reference/lexical_analysis.html#encoding-declarations
|
|
"""
|
|
byte_mark = literal_eval(r"b'\xef\xbb\xbf'")
|
|
if source.startswith(byte_mark):
|
|
# UTF-8 byte-order mark
|
|
return 'utf-8'
|
|
|
|
first_two_lines = re.match(br'(?:[^\n]*\n){0,2}', source).group(0)
|
|
possible_encoding = re.search(br"coding[=:]\s*([-\w.]+)",
|
|
first_two_lines)
|
|
if possible_encoding:
|
|
return possible_encoding.group(1)
|
|
else:
|
|
# the default if nothing else has been set -> PEP 263
|
|
return encoding
|
|
|
|
if isinstance(source, unicode):
|
|
# only cast str/bytes
|
|
return source
|
|
|
|
encoding = detect_encoding()
|
|
if not isinstance(encoding, unicode):
|
|
encoding = unicode(encoding, 'utf-8', 'replace')
|
|
|
|
# Cast to unicode
|
|
return unicode(source, encoding, errors)
|
|
|
|
|
|
def version_info():
|
|
"""
|
|
Returns a namedtuple of parso's version, similar to Python's
|
|
``sys.version_info``.
|
|
"""
|
|
from parso import __version__
|
|
tupl = re.findall(r'[a-z]+|\d+', __version__)
|
|
return Version(*[x if i == 3 else int(x) for i, x in enumerate(tupl)])
|
|
|
|
|
|
def _parse_version(version):
|
|
match = re.match(r'(\d+)(?:\.(\d)(?:\.\d+)?)?$', version)
|
|
if match is None:
|
|
raise ValueError('The given version is not in the right format. '
|
|
'Use something like "3.2" or "3".')
|
|
|
|
major = int(match.group(1))
|
|
minor = match.group(2)
|
|
if minor is None:
|
|
# Use the latest Python in case it's not exactly defined, because the
|
|
# grammars are typically backwards compatible?
|
|
if major == 2:
|
|
minor = "7"
|
|
elif major == 3:
|
|
minor = "6"
|
|
else:
|
|
raise NotImplementedError("Sorry, no support yet for those fancy new/old versions.")
|
|
minor = int(minor)
|
|
return PythonVersionInfo(major, minor)
|
|
|
|
|
|
@total_ordering
|
|
class PythonVersionInfo(namedtuple('Version', 'major, minor')):
|
|
def __gt__(self, other):
|
|
if isinstance(other, tuple):
|
|
if len(other) != 2:
|
|
raise ValueError("Can only compare to tuples of length 2.")
|
|
return (self.major, self.minor) > other
|
|
super(PythonVersionInfo, self).__gt__(other)
|
|
|
|
return (self.major, self.minor)
|
|
|
|
def __eq__(self, other):
|
|
if isinstance(other, tuple):
|
|
if len(other) != 2:
|
|
raise ValueError("Can only compare to tuples of length 2.")
|
|
return (self.major, self.minor) == other
|
|
super(PythonVersionInfo, self).__eq__(other)
|
|
|
|
def __ne__(self, other):
|
|
return not self.__eq__(other)
|
|
|
|
|
|
def parse_version_string(version=None):
|
|
"""
|
|
Checks for a valid version number (e.g. `3.2` or `2.7.1` or `3`) and
|
|
returns a corresponding version info that is always two characters long in
|
|
decimal.
|
|
"""
|
|
if version is None:
|
|
version = '%s.%s' % sys.version_info[:2]
|
|
if not isinstance(version, (unicode, str)):
|
|
raise TypeError("version must be a string like 3.2.")
|
|
|
|
return _parse_version(version)
|