unicode issues with docstrings should be gone, fixes #420

2025-12-09 23:34:45 +08:00 · 2014-07-18 17:43:25 +02:00
parent e07f51387f
commit 2616143d10
2 changed files with 15 additions and 3 deletions
--- a/jedi/parser/representation.py
+++ b/jedi/parser/representation.py
@@ -85,7 +85,13 @@ class DocstringMixin(object):
        """ Returns a cleaned version of the docstring token. """
        try:
            # Returns a literal cleaned version of the ``Token``.
-            return unicode(cleandoc(literal_eval(self._doc_token.string)))
+            cleaned = cleandoc(literal_eval(self._doc_token.string))
+            # Since we want the docstr output to be always unicode, just force
+            # it.
+            if is_py3 or isinstance(cleaned, unicode):
+                return cleaned
+            else:
+                return unicode(cleaned, 'UTF-8', 'replace')
        except AttributeError:
            return u('')

--- a/test/test_parser/test_parser.py
+++ b/test/test_parser/test_parser.py
@@ -1,4 +1,6 @@
-from jedi._compatibility import u
+# -*- coding: utf-8 -*-
+
+from jedi._compatibility import u, is_py3
 from jedi.parser import Parser
 from jedi.parser.user_context import UserContextParser
 from jedi.parser import representation as pr
@@ -136,4 +138,8 @@ def test_hex_values_in_docstring():
            return 1
        '''

-    assert Parser(dedent(u(source))).module.subscopes[0].raw_doc == '\xff'
+    doc = Parser(dedent(u(source))).module.subscopes[0].raw_doc
+    if is_py3:
+        assert doc == '\xff'
+    else:
+        assert doc == u('<EFBFBD>')