diff --git a/jedi/api.py b/jedi/api.py
index 3a138eac..e3a49ce1 100644
--- a/jedi/api.py
+++ b/jedi/api.py
@@ -61,18 +61,14 @@ class Script(object):
     :type col: int
     :param source_path: The path in the os, the current module is in.
     :type source_path: string or None
-    :param source_encoding: encoding for decoding `source`, when it
+    :param source_encoding: encoding for decoding `source`, if it
                             is not a `unicode` object.
     :type source_encoding: string
     """
     def __init__(self, source, line, column, source_path,
-                source_encoding='utf-8'):
+                 source_encoding='utf-8'):
         debug.reset_time()
-        try:
-            source = unicode(source, source_encoding, 'replace')
-            # Use 'replace' over 'ignore' to hold code structure.
-        except TypeError:  # `source` is already a unicode object
-            pass
+        source = modules.source_to_unicode(source, source_encoding)
         self.pos = line, column
         self.module = modules.ModuleWithCursor(source_path, source=source,
                                                position=self.pos)
diff --git a/jedi/dynamic.py b/jedi/dynamic.py
index 4b7ca505..41ab3a09 100644
--- a/jedi/dynamic.py
+++ b/jedi/dynamic.py
@@ -42,7 +42,7 @@ def get_directory_modules_for_name(mods, name):
 
     def check_fs(path):
         with open(path) as f:
-            source = f.read()
+            source = modules.source_to_unicode(f.read())
             if name in source:
                 return modules.Module(path, source).parser.module
 
diff --git a/jedi/modules.py b/jedi/modules.py
index 5db2d682..3a3cc25e 100644
--- a/jedi/modules.py
+++ b/jedi/modules.py
@@ -26,7 +26,7 @@ class Module(builtin.CachedModule):
     """
     def __init__(self, path, source):
         super(Module, self).__init__(path=path)
-        self.source = source
+        self.source = source_to_unicode(source)
         self._line_cache = None
 
     def _get_source(self):
@@ -311,3 +311,44 @@ def detect_django_path(module_path):
         except IOError:
             pass
     return result
+
+
+def source_to_unicode(source, encoding=None):
+    """ Decode `source` to a `unicode` object.
+
+    `unicode` input is returned unchanged. Byte strings are decoded with
+    `encoding` if it is given, otherwise with the encoding detected from
+    a UTF-8 byte-order mark or from an encoding declaration in the first
+    two lines, falling back to `iso-8859-1`. Undecodable bytes are
+    replaced rather than dropped.
+    """
+    def detect_encoding():
+        """ For the implementation of encoding definitions in Python, look at:
+        http://www.python.org/dev/peps/pep-0263/
+        http://docs.python.org/2/reference/lexical_analysis.html#encoding-\
+                declarations
+        """
+        if encoding is not None:
+            return encoding
+
+        if source.startswith('\xef\xbb\xbf'):
+            # UTF-8 byte-order mark
+            return 'utf-8'
+
+        first_two_lines = re.match(r'(?:[^\n]*\n){0,2}', source).group(0)
+        # `search`, not `match`: the declaration sits inside a comment
+        # (e.g. `# -*- coding: utf-8 -*-`), never at offset 0.
+        possible_encoding = re.search(r"coding[=:]\s*([-\w.]+)",
+                                      first_two_lines)
+        if possible_encoding:
+            return possible_encoding.group(1)
+        else:
+            # the default if nothing else has been set -> PEP 263
+            return 'iso-8859-1'
+
+    if isinstance(source, unicode):
+        # only cast str/bytes
+        return source
+
+    # cast to unicode by default
+    return unicode(source, detect_encoding(), 'replace')
diff --git a/test/regression.py b/test/regression.py
index ad493a8b..6cc9690b 100755
--- a/test/regression.py
+++ b/test/regression.py
@@ -251,10 +251,22 @@ class TestRegression(Base):
         assert len(api.Script(s, 1, 15, '/').get_definition()) == 1
         assert len(api.Script(s, 1, 10, '/').get_definition()) == 1
 
+
     def test_unicode_script(self):
         """ normally no unicode objects are being used. (<=2.7) """
-        s = unicode("import datetime; datetime.d")
-        assert len(self.complete(s))
+        s = unicode("import datetime; datetime.timedelta")
+        completions = self.complete(s)
+        assert len(completions)
+        assert type(completions[0].description) is unicode
+
+        s = utf8("author='öä'; author")
+        completions = self.complete(s)
+        assert type(completions[0].description) is unicode
+
+        s = utf8("#-*- coding: iso-8859-1 -*-\nauthor='öä'; author")
+        s = s.encode('latin-1')
+        completions = self.complete(s)
+        assert type(completions[0].description) is unicode
 
     def test_multibyte_script(self):
         """ `jedi.Script` must accept multi-byte string source. """