diff --git a/.travis.yml b/.travis.yml
index 16d38fab..29c03f1a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,5 @@
 language: python
 env:
-  - TOXENV=py25 PIP_INSECURE=t
   - TOXENV=py26
   - TOXENV=py27
   - TOXENV=py32
diff --git a/jedi/common.py b/jedi/common.py
index ebd1f25d..960d71d3 100644
--- a/jedi/common.py
+++ b/jedi/common.py
@@ -2,10 +2,9 @@
 import sys
 import contextlib
 import functools
-import tokenize
+import tokenizer as tokenize
 
 from jedi._compatibility import next, reraise
-from jedi import debug
 from jedi import settings
 
 
@@ -101,28 +100,7 @@ class NoErrorTokenizer(object):
     def __next__(self):
         if self.closed:
             raise MultiLevelStopIteration()
-        try:
-            self.current = next(self.gen)
-        except tokenize.TokenError:
-            # We just ignore this error, I try to handle it earlier - as
-            # good as possible
-            debug.warning('parentheses not closed error')
-            return self.__next__()
-        except IndentationError:
-            # This is an error, that tokenize may produce, because the code
-            # is not indented as it should. Here it just ignores this line
-            # and restarts the parser.
-            # (This is a rather unlikely error message, for normal code,
-            # tokenize seems to be pretty tolerant)
-            debug.warning('indentation error on line %s, ignoring it' %
-                          self.current[2][0])
-            # add the starting line of the last position
-            self.offset = (self.offset[0] + self.current[2][0],
-                           self.current[2][1])
-            self.gen = PushBackIterator(tokenize.generate_tokens(
-                self.readline))
-            return self.__next__()
-
+        self.current = next(self.gen)
         c = list(self.current)
 
         # stop if a new class or definition is started at position zero.
diff --git a/jedi/modules.py b/jedi/modules.py
index e67cc2c3..3301eba7 100644
--- a/jedi/modules.py
+++ b/jedi/modules.py
@@ -16,7 +16,7 @@ Apart from those classes there's a ``sys.path`` fetching function, as well as
 from __future__ import with_statement
 
 import re
-import tokenize
+import tokenizer as tokenize
 import sys
 import os
 
diff --git a/jedi/parsing.py b/jedi/parsing.py
index e5279f71..e1785c01 100644
--- a/jedi/parsing.py
+++ b/jedi/parsing.py
@@ -17,7 +17,7 @@ complexity of the ``Parser`` (there's another parser sitting inside
 """
 from __future__ import with_statement
 
-import tokenize
+import tokenizer as tokenize
 import keyword
 
 from jedi._compatibility import next, StringIO
@@ -394,6 +394,7 @@ class Parser(object):
 
             stmt = stmt_class(self.module, set_vars, used_vars, tok_list,
                               first_pos, self.end_pos)
+            stmt.parent = self.top_module
             self._check_user_stmt(stmt)
 
             # Attribute docstring (PEP 257) support
diff --git a/jedi/parsing_representation.py b/jedi/parsing_representation.py
index d4b8380c..a7dc9545 100644
--- a/jedi/parsing_representation.py
+++ b/jedi/parsing_representation.py
@@ -38,7 +38,7 @@ from __future__ import with_statement
 
 import os
 import re
-import tokenize
+import tokenizer as tokenize
 
 from jedi._compatibility import next, literal_eval, cleandoc, Python3Method, \
     encoding, property, unicode, is_py3k
diff --git a/jedi/tokenizer.py b/jedi/tokenizer.py
new file mode 100644
index 00000000..e4b5c0d6
--- /dev/null
+++ b/jedi/tokenizer.py
@@ -0,0 +1,270 @@
+"""
+This tokenizer has been copied from the ``tokenize.py`` standard library
+tokenizer. The reason was simple: the standard library tokenizer fails
+if the indentation is not right. The fast parser of jedi, however, requires
+"wrong" indentation.
+
+Basically, this is a stripped-down version of the standard library module, so
+you can read the documentation there.
+"""
+
+import string
+import re
+from token import *
+from codecs import lookup, BOM_UTF8
+import collections
+cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
+
+namechars = string.ascii_letters + '_'
+
+
+COMMENT = N_TOKENS
+tok_name[COMMENT] = 'COMMENT'
+NL = N_TOKENS + 1
+tok_name[NL] = 'NL'
+ENCODING = N_TOKENS + 2
+tok_name[ENCODING] = 'ENCODING'
+N_TOKENS += 3
+
+class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
+    def __repr__(self):
+        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
+        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
+                self._replace(type=annotated_type))
+
+def group(*choices): return '(' + '|'.join(choices) + ')'
+def any(*choices): return group(*choices) + '*'
+def maybe(*choices): return group(*choices) + '?'
+
+# Note: we use unicode matching for names ("\w") but ascii matching for
+# number literals.
+Whitespace = r'[ \f\t]*'
+Comment = r'#[^\r\n]*'
+Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
+Name = r'\w+'
+
+Hexnumber = r'0[xX][0-9a-fA-F]+'
+Binnumber = r'0[bB][01]+'
+Octnumber = r'0[oO][0-7]+'
+Decnumber = r'(?:0+|[1-9][0-9]*)'
+Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
+Exponent = r'[eE][-+]?[0-9]+'
+Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
+Expfloat = r'[0-9]+' + Exponent
+Floatnumber = group(Pointfloat, Expfloat)
+Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
+Number = group(Imagnumber, Floatnumber, Intnumber)
+
+# Tail end of ' string.
+Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
+# Tail end of " string.
+Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
+# Tail end of ''' string.
+Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
+# Tail end of """ string.
+Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
+Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
+# Single-line ' or " string.
+String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
+               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
+
+# Because of leftmost-then-longest match semantics, be sure to put the
+# longest operators first (e.g., if = came before ==, == would get
+# recognized as two instances of =).
+Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
+                 r"//=?", r"->",
+                 r"[+\-*/%&|^=<>]=?",
+                 r"~")
+
+Bracket = '[][(){}]'
+Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
+Funny = group(Operator, Bracket, Special)
+
+PlainToken = group(Number, Funny, String, Name)
+Token = Ignore + PlainToken
+
+# First (or only) line of ' or " string.
+ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
+                group("'", r'\\\r?\n'),
+                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
+                group('"', r'\\\r?\n'))
+PseudoExtras = group(r'\\\r?\n', Comment, Triple)
+PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
+
+def _compile(expr):
+    return re.compile(expr, re.UNICODE)
+
+tokenprog, pseudoprog, single3prog, double3prog = map(
+    _compile, (Token, PseudoToken, Single3, Double3))
+endprogs = {"'": _compile(Single), '"': _compile(Double),
+            "'''": single3prog, '"""': double3prog,
+            "r'''": single3prog, 'r"""': double3prog,
+            "b'''": single3prog, 'b"""': double3prog,
+            "br'''": single3prog, 'br"""': double3prog,
+            "R'''": single3prog, 'R"""': double3prog,
+            "B'''": single3prog, 'B"""': double3prog,
+            "bR'''": single3prog, 'bR"""': double3prog,
+            "Br'''": single3prog, 'Br"""': double3prog,
+            "BR'''": single3prog, 'BR"""': double3prog,
+            'r': None, 'R': None, 'b': None, 'B': None}
+
+triple_quoted = {}
+for t in ("'''", '"""',
+          "r'''", 'r"""', "R'''", 'R"""',
+          "b'''", 'b"""', "B'''", 'B"""',
+          "br'''", 'br"""', "Br'''", 'Br"""',
+          "bR'''", 'bR"""', "BR'''", 'BR"""'):
+    triple_quoted[t] = t
+single_quoted = {}
+for t in ("'", '"',
+          "r'", 'r"', "R'", 'R"',
+          "b'", 'b"', "B'", 'B"',
+          "br'", 'br"', "Br'", 'Br"',
+          "bR'", 'bR"', "BR'", 'BR"' ):
+    single_quoted[t] = t
+
+del _compile
+
+tabsize = 8
+
+class TokenError(Exception): pass
+
+
+def generate_tokens(readline):
+    lnum = parenlev = continued = 0
+    numchars = '0123456789'
+    contstr, needcont = '', 0
+    contline = None
+    indents = [0]
+
+    while True:             # loop over lines in stream
+        try:
+            line = readline()
+        except StopIteration:
+            line = b''
+
+        lnum += 1
+        pos, max = 0, len(line)
+
+        if contstr:                            # continued string
+            if not line:
+                # multiline string has not been finished
+                break
+            endmatch = endprog.match(line)
+            if endmatch:
+                pos = end = endmatch.end(0)
+                yield TokenInfo(STRING, contstr + line[:end],
+                                strstart, (lnum, end), contline + line)
+                contstr, needcont = '', 0
+                contline = None
+            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
+                yield TokenInfo(ERRORTOKEN, contstr + line,
+                                strstart, (lnum, len(line)), contline)
+                contstr = ''
+                contline = None
+                continue
+            else:
+                contstr = contstr + line
+                contline = contline + line
+                continue
+
+        elif parenlev == 0 and not continued:  # new statement
+            if not line: break
+            column = 0
+            while pos < max:                   # measure leading whitespace
+                if line[pos] == ' ':
+                    column += 1
+                elif line[pos] == '\t':
+                    column = (column//tabsize + 1)*tabsize
+                elif line[pos] == '\f':
+                    column = 0
+                else:
+                    break
+                pos += 1
+            if pos == max:
+                break
+
+            if line[pos] in '#\r\n':           # skip comments or blank lines
+                if line[pos] == '#':
+                    comment_token = line[pos:].rstrip('\r\n')
+                    nl_pos = pos + len(comment_token)
+                    yield TokenInfo(COMMENT, comment_token,
+                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
+                    yield TokenInfo(NL, line[nl_pos:],
+                                    (lnum, nl_pos), (lnum, len(line)), line)
+                else:
+                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
+                                    (lnum, pos), (lnum, len(line)), line)
+                continue
+
+            if column > indents[-1]:           # count indents or dedents
+                indents.append(column)
+                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
+            while column < indents[-1]:
+                indents = indents[:-1]
+                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
+
+        else:                                  # continued statement
+            if not line:
+                # basically a statement has not been finished here.
+                break
+            continued = 0
+
+        while pos < max:
+            pseudomatch = pseudoprog.match(line, pos)
+            if pseudomatch:                                # scan for tokens
+                start, end = pseudomatch.span(1)
+                spos, epos, pos = (lnum, start), (lnum, end), end
+                token, initial = line[start:end], line[start]
+
+                if (initial in numchars or                 # ordinary number
+                        (initial == '.' and token != '.' and token != '...')):
+                    yield TokenInfo(NUMBER, token, spos, epos, line)
+                elif initial in '\r\n':
+                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
+                                    token, spos, epos, line)
+                elif initial == '#':
+                    assert not token.endswith("\n")
+                    yield TokenInfo(COMMENT, token, spos, epos, line)
+                elif token in triple_quoted:
+                    endprog = endprogs[token]
+                    endmatch = endprog.match(line, pos)
+                    if endmatch:                           # all on one line
+                        pos = endmatch.end(0)
+                        token = line[start:pos]
+                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
+                    else:
+                        strstart = (lnum, start)           # multiple lines
+                        contstr = line[start:]
+                        contline = line
+                        break
+                elif initial in single_quoted or \
+                        token[:2] in single_quoted or \
+                        token[:3] in single_quoted:
+                    if token[-1] == '\n':                  # continued string
+                        strstart = (lnum, start)
+                        endprog = (endprogs[initial] or endprogs[token[1]] or
+                                   endprogs[token[2]])
+                        contstr, needcont = line[start:], 1
+                        contline = line
+                        break
+                    else:                                  # ordinary string
+                        yield TokenInfo(STRING, token, spos, epos, line)
+                elif initial in namechars:                 # ordinary name
+                    yield TokenInfo(NAME, token, spos, epos, line)
+                elif initial == '\\':                      # continued stmt
+                    continued = 1
+                else:
+                    if initial in '([{':
+                        parenlev += 1
+                    elif initial in ')]}':
+                        parenlev -= 1
+                    yield TokenInfo(OP, token, spos, epos, line)
+            else:
+                yield TokenInfo(ERRORTOKEN, line[pos],
+                                (lnum, pos), (lnum, pos + 1), line)
+                pos += 1
+
+    for indent in indents[1:]:                 # pop remaining indent levels
+        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
+    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
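Reviewer aside, not part of the patch: the new module keeps the stdlib ``generate_tokens(readline)`` interface, but where the standard tokenizer would raise ``IndentationError`` or ``TokenError`` it now just pops indentation levels or stops. A minimal sketch of how it can be exercised, assuming the module is importable as ``jedi.tokenizer`` on Python 3; the ``source`` string is a made-up example of the "wrong" indentation the docstring mentions:

    from io import StringIO

    from jedi import tokenizer

    # A dedent that matches no outer indentation level: the stdlib tokenizer
    # raises IndentationError ("unindent does not match any outer indentation
    # level") here, while this module simply yields DEDENT and keeps going.
    source = "def f():\n        x = 1\n    y = 2\n"

    # generate_tokens() takes a readline callable, just like stdlib tokenize.
    for token in tokenizer.generate_tokens(StringIO(source).readline):
        print(token)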