From c5fcebde8233d891041b6d798bbde8b75853f227 Mon Sep 17 00:00:00 2001
From: Dave Halter
Date: Sun, 23 Feb 2014 11:29:00 +0100
Subject: [PATCH] changed _compatibility.utf8 -> 'u' and removed a lot of the
 issues with the now enforced unicode source input of the parser

---
 jedi/_compatibility.py                     |  6 +++---
 jedi/api/interpreter.py                    |  4 +++-
 jedi/evaluate/compiled/fake.py             |  4 ++--
 jedi/parser/fast.py                        |  4 ++--
 jedi/parser/token.py                       |  6 +++---
 jedi/parser/tokenize.py                    | 15 +++++++--------
 jedi/parser/user_context.py                |  9 +++++----
 scripts/wx_check.py                        |  2 ++
 test/test_api/test_unicode.py              | 20 ++++++++++----------
 test/test_evaluate/test_absolute_import.py |  7 ++++---
 10 files changed, 41 insertions(+), 36 deletions(-)

diff --git a/jedi/_compatibility.py b/jedi/_compatibility.py
index edfec111..a862b74b 100644
--- a/jedi/_compatibility.py
+++ b/jedi/_compatibility.py
@@ -90,11 +90,11 @@ except NameError:
     unicode = str
 
 if is_py3:
-    utf8 = lambda s: s
+    u = lambda s: s
 else:
-    utf8 = lambda s: s.decode('utf-8')
+    u = lambda s: s.decode('utf-8')
 
-utf8.__doc__ = """
+u.__doc__ = """
 Decode a raw string into unicode object.  Do nothing in Python 3.
 """
 
diff --git a/jedi/api/interpreter.py b/jedi/api/interpreter.py
index cf38dff5..48093046 100644
--- a/jedi/api/interpreter.py
+++ b/jedi/api/interpreter.py
@@ -3,6 +3,7 @@ import re
 
 from jedi._compatibility import builtins
 from jedi import debug
+from jedi.common import source_to_unicode
 from jedi.cache import underscore_memoization
 from jedi.evaluate import compiled
 from jedi.evaluate.compiled.fake import get_module
@@ -68,7 +69,8 @@ class LazyName(helpers.FakeName):
         if path.endswith('.py'):
             # cut the `c` from `.pyc`
             with open(path) as f:
-                mod = FastParser(f.read(), path[:-1]).module
+                source = source_to_unicode(f.read())
+                mod = FastParser(source, path[:-1]).module
             if not parser_path:
                 return mod
             found = self._evaluator.eval_call_path(iter(parser_path), mod, None)
diff --git a/jedi/evaluate/compiled/fake.py b/jedi/evaluate/compiled/fake.py
index f4e44c9a..091c3129 100644
--- a/jedi/evaluate/compiled/fake.py
+++ b/jedi/evaluate/compiled/fake.py
@@ -7,7 +7,7 @@ mixing in Python code, the autocompletion should work much better for builtins.
 import os
 import inspect
 
-from jedi._compatibility import is_py3, builtins
+from jedi._compatibility import is_py3, builtins, unicode
 from jedi.parser import Parser
 from jedi.parser import token as token_pr
 from jedi.parser.representation import Class
@@ -31,7 +31,7 @@ def _load_faked_module(module):
     except IOError:
         modules[module_name] = None
         return
-    module = Parser(source, module_name).module
+    module = Parser(unicode(source), module_name).module
     modules[module_name] = module
 
     if module_name == 'builtins' and not is_py3:
diff --git a/jedi/parser/fast.py b/jedi/parser/fast.py
index c3eee157..ced65ce8 100644
--- a/jedi/parser/fast.py
+++ b/jedi/parser/fast.py
@@ -5,7 +5,7 @@ finished (and still not working as I want), I won't document it any further.
""" import re -from jedi._compatibility import use_metaclass +from jedi._compatibility import use_metaclass, unicode from jedi import settings from jedi import common from jedi.parser import Parser @@ -275,7 +275,7 @@ class FastParser(use_metaclass(CachedFastParser)): def _parse(self, code): """ :type code: str """ def empty_parser(): - new, temp = self._get_parser('', '', 0, [], False) + new, temp = self._get_parser(unicode(''), unicode(''), 0, [], False) return new parts = self._split_parts(code) diff --git a/jedi/parser/token.py b/jedi/parser/token.py index 292d4a52..a42b2358 100644 --- a/jedi/parser/token.py +++ b/jedi/parser/token.py @@ -8,7 +8,7 @@ found that a flat object with slots is the best. from inspect import cleandoc from ast import literal_eval -from jedi._compatibility import utf8, unicode +from jedi._compatibility import u, unicode class Token(object): @@ -37,7 +37,7 @@ class Token(object): 4 >>> Token.from_tuple((6, 5, (4, 3))) - >>> unicode(Token(1, utf8("😷"), 1 ,1)) + "p" == utf8("😷p") + >>> unicode(Token(1, u("😷"), 1 ,1)) + "p" == u("😷p") True """ __slots__ = ("_token_type", "_token", "_start_pos_line", "_start_pos_col") @@ -166,4 +166,4 @@ class TokenDocstring(TokenNoCompat): def as_string(self): """Returns a literal cleaned version of the token""" - return cleandoc(literal_eval(self.token)) + return unicode(cleandoc(literal_eval(self.token))) diff --git a/jedi/parser/tokenize.py b/jedi/parser/tokenize.py index 7c155498..a8f3879e 100644 --- a/jedi/parser/tokenize.py +++ b/jedi/parser/tokenize.py @@ -12,7 +12,8 @@ from __future__ import absolute_import import string import re from io import StringIO -from token import * +from token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, NAME, OP, + ERRORTOKEN, NEWLINE) import collections cookie_re = re.compile("coding[:=]\s*([-\w.]+)") @@ -23,9 +24,8 @@ namechars = string.ascii_letters + '_' COMMENT = N_TOKENS tok_name[COMMENT] = 'COMMENT' -ENCODING = N_TOKENS + 2 +ENCODING = N_TOKENS + 1 tok_name[ENCODING] = 'ENCODING' -N_TOKENS += 3 class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end')): @@ -153,7 +153,6 @@ def generate_tokens(readline, line_offset=0): Modified to not care about dedents. """ lnum = line_offset - continued = False numchars = '0123456789' contstr = '' contline = None @@ -161,7 +160,7 @@ def generate_tokens(readline, line_offset=0): line = readline() # readline returns empty if it's finished. 
         if not line:
             if contstr:
-                yield TokenInfo(ERRORTOKEN, contstr, strstart, (lnum, pos))
+                yield TokenInfo(ERRORTOKEN, contstr, contstr_start, (lnum, pos))
             break
 
         lnum += 1
@@ -171,7 +170,7 @@
             endmatch = endprog.match(line)
             if endmatch:
                 pos = end = endmatch.end(0)
-                yield TokenInfo(STRING, contstr + line[:end], strstart, (lnum, end))
+                yield TokenInfo(STRING, contstr + line[:end], contstr_start, (lnum, end))
                 contstr = ''
                 contline = None
             else:
@@ -207,7 +206,7 @@
                     token = line[start:pos]
                     yield TokenInfo(STRING, token, spos, (lnum, pos))
                 else:
-                    strstart = (lnum, start)  # multiple lines
+                    contstr_start = (lnum, start)  # multiple lines
                     contstr = line[start:]
                     contline = line
                     break
@@ -215,7 +214,7 @@
                 token[:2] in single_quoted or \
                 token[:3] in single_quoted:
                     if token[-1] == '\n':  # continued string
-                        strstart = (lnum, start)
+                        contstr_start = lnum, start
                         endprog = (endprogs[initial] or endprogs[token[1]] or
                                    endprogs[token[2]])
                         contstr = line[start:]
diff --git a/jedi/parser/user_context.py b/jedi/parser/user_context.py
index e69a3b7f..cdbb7e70 100644
--- a/jedi/parser/user_context.py
+++ b/jedi/parser/user_context.py
@@ -3,6 +3,7 @@ import os
 
 from jedi import cache
 from jedi.parser import tokenize
+from jedi._compatibility import u
 from jedi.parser.fast import FastParser
 from jedi.parser import representation
 from jedi import debug
@@ -70,7 +71,7 @@ class UserContext(object):
         for token_type, tok, start, end in gen:
             if is_first:
                 if start != (1, 0):  # whitespace is not a path
-                    return '', start_cursor
+                    return u(''), start_cursor
                 is_first = False
 
             # print 'tok', token_type, tok, force_point
@@ -167,14 +168,14 @@
             self._line_cache = self.source.splitlines()
             if self.source:
                 if self.source[-1] == '\n':
-                    self._line_cache.append('')
+                    self._line_cache.append(u(''))
             else:  # ''.splitlines() == []
-                self._line_cache = ['']
+                self._line_cache = [u('')]
 
         if line_nr == 0:
             # This is a fix for the zeroth line. We need a newline there, for
             # the backwards parser.
-            return ''
+            return u('')
         if line_nr < 0:
             raise StopIteration()
         try:
diff --git a/scripts/wx_check.py b/scripts/wx_check.py
index 1e48c39e..5c6e997e 100755
--- a/scripts/wx_check.py
+++ b/scripts/wx_check.py
@@ -37,6 +37,7 @@ def process_memory():
 
 uri = 'http://svn.wxwidgets.org/viewvc/wx/wxPython/trunk/src/gtk/_core.py?revision=74740&content-type=text%2Fplain&view=co'
 wx_core = urllib2.urlopen(uri).read()
+wx_core = wx_core[:1]
 
 
 def run():
@@ -44,6 +45,7 @@ def run():
     print('Process Memory before: %skB' % process_memory())
     # After this the module should be cached.
     # Need to invent a path so that it's really cached.
+    print type(wx_core), wx_core
     jedi.Script(wx_core, path='foobar.py').completions()
 
     gc.collect()  # make sure that it's all fair and the gc did its job.
diff --git a/test/test_api/test_unicode.py b/test/test_api/test_unicode.py
index 895be6bd..d6b6cbe1 100644
--- a/test/test_api/test_unicode.py
+++ b/test/test_api/test_unicode.py
@@ -3,7 +3,7 @@ All character set and unicode related tests.
""" from jedi import Script -from jedi._compatibility import utf8, unicode +from jedi._compatibility import u, unicode def test_unicode_script(): @@ -13,12 +13,12 @@ def test_unicode_script(): assert len(completions) assert type(completions[0].description) is unicode - s = utf8("author='öä'; author") + s = u("author='öä'; author") completions = Script(s).completions() x = completions[0].description assert type(x) is unicode - s = utf8("#-*- coding: iso-8859-1 -*-\nauthor='öä'; author") + s = u("#-*- coding: iso-8859-1 -*-\nauthor='öä'; author") s = s.encode('latin-1') completions = Script(s).completions() assert type(completions[0].description) is unicode @@ -26,12 +26,12 @@ def test_unicode_script(): def test_unicode_attribute(): """ github jedi-vim issue #94 """ - s1 = utf8('#-*- coding: utf-8 -*-\nclass Person():\n' - ' name = "e"\n\nPerson().name.') + s1 = u('#-*- coding: utf-8 -*-\nclass Person():\n' + ' name = "e"\n\nPerson().name.') completions1 = Script(s1).completions() assert 'strip' in [c.name for c in completions1] - s2 = utf8('#-*- coding: utf-8 -*-\nclass Person():\n' - ' name = "é"\n\nPerson().name.') + s2 = u('#-*- coding: utf-8 -*-\nclass Person():\n' + ' name = "é"\n\nPerson().name.') completions2 = Script(s2).completions() assert 'strip' in [c.name for c in completions2] @@ -39,9 +39,9 @@ def test_unicode_attribute(): def test_multibyte_script(): """ `jedi.Script` must accept multi-byte string source. """ try: - code = unicode("import datetime; datetime.d") - comment = utf8("# multi-byte comment あいうえおä") - s = (unicode('%s\n%s') % (code, comment)).encode('utf-8') + code = u("import datetime; datetime.d") + comment = u("# multi-byte comment あいうえおä") + s = (u('%s\n%s') % (code, comment)).encode('utf-8') except NameError: pass # python 3 has no unicode method else: diff --git a/test/test_evaluate/test_absolute_import.py b/test/test_evaluate/test_absolute_import.py index f274ac33..932356b2 100644 --- a/test/test_evaluate/test_absolute_import.py +++ b/test/test_evaluate/test_absolute_import.py @@ -3,6 +3,7 @@ Tests ``from __future__ import absolute_import`` (only important for Python 2.X) """ import jedi +from jedi._compatibility import u from jedi.parser import Parser from .. import helpers @@ -11,7 +12,7 @@ def test_explicit_absolute_imports(): """ Detect modules with ``from __future__ import absolute_import``. """ - parser = Parser("from __future__ import absolute_import", "test.py") + parser = Parser(u("from __future__ import absolute_import"), "test.py") assert parser.module.has_explicit_absolute_import @@ -19,7 +20,7 @@ def test_no_explicit_absolute_imports(): """ Detect modules without ``from __future__ import absolute_import``. """ - parser = Parser("1", "test.py") + parser = Parser(u("1"), "test.py") assert not parser.module.has_explicit_absolute_import @@ -28,7 +29,7 @@ def test_dont_break_imports_without_namespaces(): The code checking for ``from __future__ import absolute_import`` shouldn't assume that all imports have non-``None`` namespaces. """ - src = "from __future__ import absolute_import\nimport xyzzy" + src = u("from __future__ import absolute_import\nimport xyzzy") parser = Parser(src, "test.py") assert parser.module.has_explicit_absolute_import