added source_to_unicode method to deal with different encodings, davidhalter/jedi-vim#48

2025-12-21 13:01:14 +08:00 · 2012-12-19 20:35:24 +01:00
parent 15510a4c3b
commit 5e2e0a8f45
4 changed files with 49 additions and 11 deletions
--- a/jedi/api.py
+++ b/jedi/api.py
@@ -61,18 +61,14 @@ class Script(object):
    :type col: int
    :param source_path: The path in the os, the current module is in.
    :type source_path: string or None
-    :param source_encoding: encoding for decoding `source`, when it
+    :param source_encoding: encoding for decoding `source`, if it
                            is not a `unicode` object.
    :type source_encoding: string
    """
    def __init__(self, source, line, column, source_path,
                                 source_encoding='utf-8'):
        debug.reset_time()
-        try:
-            source = unicode(source, source_encoding, 'replace')
-            # Use 'replace' over 'ignore' to hold code structure.
-        except TypeError:  # `source` is already a unicode object
-            pass
+        source = modules.source_to_unicode(source, source_encoding)
        self.pos = line, column
        self.module = modules.ModuleWithCursor(source_path, source=source,
                                                            position=self.pos)
--- a/jedi/dynamic.py
+++ b/jedi/dynamic.py
@@ -42,7 +42,7 @@ def get_directory_modules_for_name(mods, name):

    def check_fs(path):
        with open(path) as f:
-            source = f.read()
+            source = modules.source_to_unicode(f.read())
            if name in source:
                return modules.Module(path, source).parser.module

--- a/jedi/modules.py
+++ b/jedi/modules.py
@@ -26,7 +26,7 @@ class Module(builtin.CachedModule):
    """
    def __init__(self, path, source):
        super(Module, self).__init__(path=path)
-        self.source = source
+        self.source = source_to_unicode(source)
        self._line_cache = None

    def _get_source(self):
@@ -311,3 +311,33 @@ def detect_django_path(module_path):
        except IOError:
            pass
    return result
+
+
+def source_to_unicode(source, encoding=None):
+    """ Return `source` as a `unicode` object, decoding byte strings.
+
+    :param source: the source code, a `unicode` object or a byte string.
+    :param encoding: encoding to decode with; detected from `source` if None.
+    """
+    def detect_encoding():
+        """ For the implementation of encoding definitions in Python, look at:
+        http://www.python.org/dev/peps/pep-0263/
+        http://docs.python.org/2/reference/lexical_analysis.html#encoding-\
+                                                                declarations
+        """
+        if encoding is not None:
+            return encoding
+        if source.startswith('\xef\xbb\xbf'):
+            return 'utf-8'  # UTF-8 byte-order mark
+        first_two_lines = re.match(r'(?:[^\n]*\n){0,2}', source).group(0)
+        # `search`, not `match`: in a declaration like `# -*- coding: x -*-`
+        # the word `coding` is never at the very start of the text.
+        possible_encoding = re.search(r"coding[=:]\s*([-\w.]+)",
+                                      first_two_lines)
+        if possible_encoding:
+            return possible_encoding.group(1)
+        return 'iso-8859-1'  # default if nothing else has been set -> PEP 263
+
+    if isinstance(source, unicode):
+        return source  # only cast str/bytes
+    return unicode(source, detect_encoding(), 'replace')
--- a/test/regression.py
+++ b/test/regression.py
@@ -251,10 +251,22 @@ class TestRegression(Base):
        assert len(api.Script(s, 1, 15, '/').get_definition()) == 1
        assert len(api.Script(s, 1, 10, '/').get_definition()) == 1

+
    def test_unicode_script(self):
        """ normally no unicode objects are being used. (<=2.7) """
-        s = unicode("import datetime; datetime.d")
-        assert len(self.complete(s))
+        s = unicode("import datetime; datetime.timedelta")
+        completions = self.complete(s)
+        assert len(completions)
+        assert type(completions[0].description) is unicode
+
+        s = utf8("author='öä'; author")
+        completions = self.complete(s)
+        assert type(completions[0].description) is unicode
+
+        s = utf8("#-*- coding: iso-8859-1 -*-\nauthor='öä'; author")
+        s = s.encode('latin-1')
+        completions = self.complete(s)
+        assert type(completions[0].description) is unicode

    def test_multibyte_script(self):
        """ `jedi.Script` must accept multi-byte string source. """