Merge pull request #115 from andviro/master

more robust source encoding detection
This commit is contained in:
David Halter
2013-01-27 06:31:41 -08:00
2 changed files with 7 additions and 8 deletions

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
*~
*.swp
*.swo
.ropeproject

View File

@@ -1,6 +1,6 @@
from __future__ import with_statement
from _compatibility import exec_function, unicode
from _compatibility import exec_function, unicode, is_py25, literal_eval
import re
import tokenize
@@ -327,20 +327,18 @@ def source_to_unicode(source, encoding=None):
http://docs.python.org/2/reference/lexical_analysis.html#encoding-\
declarations
"""
if encoding is not None:
return encoding
if source.startswith('\xef\xbb\xbf'):
byte_mark = '\xef\xbb\xbf' if is_py25 else literal_eval(r"b'\xef\xbb\xbf'")
if source.startswith(byte_mark):
# UTF-8 byte-order mark
return 'utf-8'
first_two_lines = re.match(r'(?:[^\n]*\n){0,2}', source).group(0)
possible_encoding = re.match("coding[=:]\s*([-\w.]+)", first_two_lines)
first_two_lines = re.match(r'(?:[^\n]*\n){0,2}', str(source)).group(0)
possible_encoding = re.search(r"coding[=:]\s*([-\w.]+)", first_two_lines)
if possible_encoding:
return possible_encoding.group(1)
else:
# the default if nothing else has been set -> PEP 263
return 'iso-8859-1'
return encoding if encoding is not None else 'iso-8859-1'
if isinstance(source, unicode):
# only cast str/bytes