diff --git a/jedi/common.py b/jedi/common.py
index 0e9afd11..6696eadc 100644
--- a/jedi/common.py
+++ b/jedi/common.py
@@ -127,7 +127,7 @@ def source_to_unicode(source, encoding=None):
             # UTF-8 byte-order mark
             return 'utf-8'
 
-        first_two_lines = re.match(r'(?:[^\n]*\n){0,2}', str(source)).group(0)
+        first_two_lines = re.match(r'(?:[^\n]*\n){0,2}', source.decode('iso-8859-1')).group(0)
         possible_encoding = re.search(r"coding[=:]\s*([-\w.]+)",
                                       first_two_lines)
         if possible_encoding:
diff --git a/test/test_regression.py b/test/test_regression.py
index 94132acd..2ea38b86 100644
--- a/test/test_regression.py
+++ b/test/test_regression.py
@@ -14,6 +14,7 @@ import jedi
 from jedi._compatibility import u
 from jedi import Script
 from jedi import api
+from jedi import common
 from jedi.evaluate import imports
 from jedi.parser import Parser, load_grammar
 
@@ -156,6 +157,15 @@ class TestRegression(TestCase):
             "abc()."
         assert Script(s).completions()
 
+    def test_source_to_unicode_unicode_text(self):
+        source = (
+            b"# vim: fileencoding=utf-8\n"
+            b"# \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a\n"
+        )
+        actual = common.source_to_unicode(source)
+        expected = source.decode('utf-8')
+        assert actual == expected
+
 
 def test_loading_unicode_files_with_bad_global_charset(monkeypatch, tmpdir):
     dirname = str(tmpdir.mkdir('jedi-test'))
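
Note (illustrative, not part of the patch): on Python 3, calling str() on a bytes object
returns its repr, e.g. "b'# coding: utf-8\\n'", so the newlines are escaped and the
two-line regex above matches nothing, meaning the coding declaration is never seen.
Decoding with 'iso-8859-1' cannot raise for any byte sequence and leaves the ASCII
declaration readable. A minimal sketch of the difference, using a made-up sample source:

    source = b"# coding: utf-8\nimport os\n"
    str(source)                   # "b'# coding: utf-8\\nimport os\\n'" (a repr with no real newlines)
    source.decode('iso-8859-1')   # '# coding: utf-8\nimport os\n' (text the coding regex can scan)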