Merge pull request #115 from andviro/master

more robust source encoding detection
2026-02-04 19:22:55 +08:00 · 2013-01-27 06:31:41 -08:00
parent b906fe4209 048608f4b7
commit 0e3cec5b17
2 changed files with 7 additions and 8 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+*~
 *.swp
 *.swo
 .ropeproject
--- a/jedi/modules.py
+++ b/jedi/modules.py
@@ -1,6 +1,6 @@
 from __future__ import with_statement

-from _compatibility import exec_function, unicode
+from _compatibility import exec_function, unicode, is_py25, literal_eval

 import re
 import tokenize
@@ -327,20 +327,18 @@ def source_to_unicode(source, encoding=None):
        http://docs.python.org/2/reference/lexical_analysis.html#encoding-\
                                                                declarations
        """
-        if encoding is not None:
-            return encoding
-
-        if source.startswith('\xef\xbb\xbf'):
+        byte_mark = '\xef\xbb\xbf' if is_py25 else literal_eval(r"b'\xef\xbb\xbf'")
+        if source.startswith(byte_mark):
            # UTF-8 byte-order mark
            return 'utf-8'

-        first_two_lines = re.match(r'(?:[^\n]*\n){0,2}', source).group(0)
-        possible_encoding = re.match("coding[=:]\s*([-\w.]+)", first_two_lines)
+        first_two_lines = re.match(r'(?:[^\n]*\n){0,2}', str(source)).group(0)
+        possible_encoding = re.search(r"coding[=:]\s*([-\w.]+)", first_two_lines)
        if possible_encoding:
            return possible_encoding.group(1)
        else:
            # the default if nothing else has been set -> PEP 263
-            return 'iso-8859-1'
+            return encoding if encoding is not None else 'iso-8859-1'

    if isinstance(source, unicode):
        # only cast str/bytes