Always load source files in binary mode.

Source files can use any encoding, provided Python can read a `coding:` declaration at the beginning of the file. Source files should therefore be opened in binary mode and decoded according to that declaration, rather than assumed to be in the default encoding. Fixes #398.
2014-04-23 16:25:12 +02:00
parent 67f49da15c
commit 9ed0dc4861
4 changed files with 24 additions and 8 deletions
--- a/AUTHORS.txt
+++ b/AUTHORS.txt
@@ -22,5 +22,6 @@ Akinori Hattori (@hattya)
 srusskih (@srusskih)
 Steven Silvester (@blink1073)
 Colin Duquesnoy (@ColinDuquesnoy) <colin.duquesnoy@gmail.com>
+Jorgen Schaefer (@jorgenschaefer) <contact@jorgenschaefer.de>

 Note: (@user) means a github user name.
--- a/jedi/_compatibility.py
+++ b/jedi/_compatibility.py
@@ -31,9 +31,7 @@ def find_module_py33(string, path=None):
            module_file = None
        else:
            module_path = loader.get_filename(string)
-            module_ext = os.path.splitext(module_path)[1]
-            mode = 'rb' if module_ext in ['.pyc', '.so', '.pyd'] else 'r'
-            module_file = open(module_path, mode)
+            module_file = open(module_path, 'rb')
    except AttributeError:
        # ExtensionLoader has not attribute get_filename, instead it has a
        # path attribute that we can use to retrieve the module path
--- a/jedi/evaluate/imports.py
+++ b/jedi/evaluate/imports.py
@@ -331,8 +331,8 @@ class _Importer(object):
                        deeper_paths.append(new)
                return follow_path(directories, deeper_paths)

-        with open(os.path.join(found_path, '__init__.py')) as f:
-            content = f.read()
+        with open(os.path.join(found_path, '__init__.py'), 'rb') as f:
+            content = common.source_to_unicode(f.read())
            # these are strings that need to be used for namespace packages,
            # the first one is ``pkgutil``, the second ``pkg_resources``.
            options = 'declare_namespace(__name__)', 'extend_path(__path__'
@@ -406,7 +406,7 @@ class _Importer(object):
            # is a directory module
            if is_package_directory:
                path += '/__init__.py'
-                with open(path) as f:
+                with open(path, 'rb') as f:
                    source = f.read()
            else:
                source = current_namespace[0].read()
@@ -454,7 +454,7 @@ def load_module(path=None, source=None, name=None):
    def load(source):
        if path is not None and path.endswith('.py'):
            if source is None:
-                with open(path) as f:
+                with open(path, 'rb') as f:
                    source = f.read()
        else:
            return compiled.load_module(path, name)
@@ -481,7 +481,7 @@ def get_modules_containing_name(mods, name):
                return None

    def check_fs(path):
-        with open(path) as f:
+        with open(path, 'rb') as f:
            source = source_to_unicode(f.read())
            if name in source:
                return load_module(path, source)
--- a/test/test_regression.py
+++ b/test/test_regression.py
@@ -4,6 +4,7 @@ found a good place in any other testing module.
 """

 import os
+import sys
 import textwrap

 from .helpers import TestCase, cwd_at
@@ -154,3 +155,19 @@ class TestRegression(TestCase):
            "    yield 1\n" \
            "abc()."
        assert Script(s).completions()
+
+
+def test_loading_unicode_files_with_bad_global_charset(monkeypatch, tmpdir):
+    dirname = str(tmpdir.mkdir('jedi-test'))
+    filename1 = os.path.join(dirname, 'test1.py')
+    filename2 = os.path.join(dirname, 'test2.py')
+    if sys.version_info < (3, 0):
+        data = "# coding: latin-1\nfoo = 'm\xf6p'\n"
+    else:
+        data = "# coding: latin-1\nfoo = 'm\xf6p'\n".encode("latin-1")
+
+    with open(filename1, "wb") as f:
+        f.write(data)
+    s = Script("from test1 import foo\nfoo.",
+               line=2, column=4, path=filename2)
+    s.complete()