more robust source encoding detection

2025-12-06 14:04:26 +08:00 · 2013-01-25 22:57:23 +04:00
parent 23f36c86d7
commit 9c152119ad
2 changed files with 3 additions and 5 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+*~
 *.swp
 *.swo
 .ropeproject
--- a/jedi/modules.py
+++ b/jedi/modules.py
@@ -328,20 +328,17 @@ def source_to_unicode(source, encoding=None):
        http://docs.python.org/2/reference/lexical_analysis.html#encoding-\
                                                                declarations
        """
-        if encoding is not None:
-            return encoding
-
        if source.startswith('\xef\xbb\xbf'):
            # UTF-8 byte-order mark
            return 'utf-8'

        first_two_lines = re.match(r'(?:[^\n]*\n){0,2}', source).group(0)
-        possible_encoding = re.match("coding[=:]\s*([-\w.]+)", first_two_lines)
+        possible_encoding = re.search(r"coding[=:]\s*([-\w.]+)", first_two_lines)
        if possible_encoding:
            return possible_encoding.group(1)
        else:
            # the default if nothing else has been set -> PEP 263
-            return 'iso-8859-1'
+            return encoding if encoding is not None else 'iso-8859-1'

    if isinstance(source, unicode):
        # only cast str/bytes