From 9c152119ada6f508877951afb37ca5d2f81d0c89 Mon Sep 17 00:00:00 2001 From: Andrew Rodionoff Date: Fri, 25 Jan 2013 22:57:23 +0400 Subject: [PATCH 1/3] more robust source encoding detection --- .gitignore | 1 + jedi/modules.py | 7 ++----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 1f303fe4..5c08b8e0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +*~ *.swp *.swo .ropeproject diff --git a/jedi/modules.py b/jedi/modules.py index 75c89ff2..a55290d7 100644 --- a/jedi/modules.py +++ b/jedi/modules.py @@ -328,20 +328,17 @@ def source_to_unicode(source, encoding=None): http://docs.python.org/2/reference/lexical_analysis.html#encoding-\ declarations """ - if encoding is not None: - return encoding - if source.startswith('\xef\xbb\xbf'): # UTF-8 byte-order mark return 'utf-8' first_two_lines = re.match(r'(?:[^\n]*\n){0,2}', source).group(0) - possible_encoding = re.match("coding[=:]\s*([-\w.]+)", first_two_lines) + possible_encoding = re.search(r"coding[=:]\s*([-\w.]+)", first_two_lines) if possible_encoding: return possible_encoding.group(1) else: # the default if nothing else has been set -> PEP 263 - return 'iso-8859-1' + return encoding if encoding is not None else 'iso-8859-1' if isinstance(source, unicode): # only cast str/bytes From 0315da9699a8fde88828887f0f138b9892edb94d Mon Sep 17 00:00:00 2001 From: Andrew Rodionoff Date: Sat, 26 Jan 2013 02:10:26 +0400 Subject: [PATCH 2/3] compatibility with python3 fix --- jedi/modules.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jedi/modules.py b/jedi/modules.py index a55290d7..1f069e2e 100644 --- a/jedi/modules.py +++ b/jedi/modules.py @@ -328,11 +328,11 @@ def source_to_unicode(source, encoding=None): http://docs.python.org/2/reference/lexical_analysis.html#encoding-\ declarations """ - if source.startswith('\xef\xbb\xbf'): + if source.startswith(b'\xef\xbb\xbf'): # UTF-8 byte-order mark return 'utf-8' - first_two_lines = re.match(r'(?:[^\n]*\n){0,2}', source).group(0) + first_two_lines = re.match(r'(?:[^\n]*\n){0,2}', str(source)).group(0) possible_encoding = re.search(r"coding[=:]\s*([-\w.]+)", first_two_lines) if possible_encoding: return possible_encoding.group(1) From 048608f4b7c8f37bcfdbde76457fa68568447a05 Mon Sep 17 00:00:00 2001 From: Andrew Rodionoff Date: Sat, 26 Jan 2013 09:26:04 +0400 Subject: [PATCH 3/3] compatibility with python2.5 --- jedi/modules.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/jedi/modules.py b/jedi/modules.py index 1f069e2e..f6a84243 100644 --- a/jedi/modules.py +++ b/jedi/modules.py @@ -1,6 +1,6 @@ from __future__ import with_statement -from _compatibility import exec_function, unicode +from _compatibility import exec_function, unicode, is_py25, literal_eval import re import tokenize @@ -328,7 +328,8 @@ def source_to_unicode(source, encoding=None): http://docs.python.org/2/reference/lexical_analysis.html#encoding-\ declarations """ - if source.startswith(b'\xef\xbb\xbf'): + byte_mark = '\xef\xbb\xbf' if is_py25 else literal_eval(r"b'\xef\xbb\xbf'") + if source.startswith(byte_mark): # UTF-8 byte-order mark return 'utf-8'