more robust source encoding detection

This commit is contained in:
Andrew Rodionoff
2013-01-25 22:57:23 +04:00
parent 23f36c86d7
commit 9c152119ad
2 changed files with 3 additions and 5 deletions

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
*~
*.swp
*.swo
.ropeproject

View File

@@ -328,20 +328,17 @@ def source_to_unicode(source, encoding=None):
http://docs.python.org/2/reference/lexical_analysis.html#encoding-\
declarations
"""
if encoding is not None:
return encoding
if source.startswith('\xef\xbb\xbf'):
# UTF-8 byte-order mark
return 'utf-8'
first_two_lines = re.match(r'(?:[^\n]*\n){0,2}', source).group(0)
possible_encoding = re.match("coding[=:]\s*([-\w.]+)", first_two_lines)
possible_encoding = re.search(r"coding[=:]\s*([-\w.]+)", first_two_lines)
if possible_encoding:
return possible_encoding.group(1)
else:
# the default if nothing else has been set -> PEP 263
return 'iso-8859-1'
return encoding if encoding is not None else 'iso-8859-1'
if isinstance(source, unicode):
# only cast str/bytes