more robust source encoding detection

This commit is contained in:
Andrew Rodionoff
2013-01-25 22:57:23 +04:00
parent 23f36c86d7
commit 9c152119ad
2 changed files with 3 additions and 5 deletions

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
*~
*.swp *.swp
*.swo *.swo
.ropeproject .ropeproject

View File

@@ -328,20 +328,17 @@ def source_to_unicode(source, encoding=None):
http://docs.python.org/2/reference/lexical_analysis.html#encoding-\ http://docs.python.org/2/reference/lexical_analysis.html#encoding-\
declarations declarations
""" """
if encoding is not None:
return encoding
if source.startswith('\xef\xbb\xbf'): if source.startswith('\xef\xbb\xbf'):
# UTF-8 byte-order mark # UTF-8 byte-order mark
return 'utf-8' return 'utf-8'
first_two_lines = re.match(r'(?:[^\n]*\n){0,2}', source).group(0) first_two_lines = re.match(r'(?:[^\n]*\n){0,2}', source).group(0)
possible_encoding = re.match("coding[=:]\s*([-\w.]+)", first_two_lines) possible_encoding = re.search(r"coding[=:]\s*([-\w.]+)", first_two_lines)
if possible_encoding: if possible_encoding:
return possible_encoding.group(1) return possible_encoding.group(1)
else: else:
# the default if nothing else has been set -> PEP 263 # the default if nothing else has been set -> PEP 263
return 'iso-8859-1' return encoding if encoding is not None else 'iso-8859-1'
if isinstance(source, unicode): if isinstance(source, unicode):
# only cast str/bytes # only cast str/bytes