1
0
forked from VimPlug/jedi
Files
jedi-fork/jedi/tokenizer.py
2013-04-20 09:23:27 +04:30

305 lines
12 KiB
Python

"""
This tokenizer has been copied from the ``tokenize.py`` standard library
tokenizer. The reason was simple: The standanrd library tokenizer fails
if the indentation is not right. The fast parser of jedi however requires
"wrong" indentation.
Tokenization help for Python programs.
++++++++++++++++++++++++++++++++++++++
tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens. It decodes the bytes according to PEP-0263 for
determining source file encoding.
It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF). It generates 5-tuples with these
members:
the token type (see token.py)
the token (a string)
the starting (row, column) indices of the token (a 2-tuple of ints)
the ending (row, column) indices of the token (a 2-tuple of ints)
the original line (string)
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators. Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""
import string
import re
from token import *
from codecs import lookup, BOM_UTF8
import collections
cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
namechars = string.ascii_letters + '_'
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
def __repr__(self):
annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
self._replace(type=annotated_type))
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'
Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
r"//=?", r"->",
r"[+\-*/%&|^=<>]=?",
r"~")
Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
group("'", r'\\\r?\n'),
r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
def _compile(expr):
return re.compile(expr, re.UNICODE)
tokenprog, pseudoprog, single3prog, double3prog = map(
_compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": _compile(Single), '"': _compile(Double),
"'''": single3prog, '"""': double3prog,
"r'''": single3prog, 'r"""': double3prog,
"b'''": single3prog, 'b"""': double3prog,
"br'''": single3prog, 'br"""': double3prog,
"R'''": single3prog, 'R"""': double3prog,
"B'''": single3prog, 'B"""': double3prog,
"bR'''": single3prog, 'bR"""': double3prog,
"Br'''": single3prog, 'Br"""': double3prog,
"BR'''": single3prog, 'BR"""': double3prog,
'r': None, 'R': None, 'b': None, 'B': None}
triple_quoted = {}
for t in ("'''", '"""',
"r'''", 'r"""', "R'''", 'R"""',
"b'''", 'b"""', "B'''", 'B"""',
"br'''", 'br"""', "Br'''", 'Br"""',
"bR'''", 'bR"""', "BR'''", 'BR"""'):
triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
"r'", 'r"', "R'", 'R"',
"b'", 'b"', "B'", 'B"',
"br'", 'br"', "Br'", 'Br"',
"bR'", 'bR"', "BR'", 'BR"' ):
single_quoted[t] = t
del _compile
tabsize = 8
class TokenError(Exception): pass
def _tokenize(readline, encoding):
lnum = parenlev = continued = 0
numchars = '0123456789'
contstr, needcont = '', 0
contline = None
indents = [0]
if encoding is not None:
if encoding == "utf-8-sig":
# BOM will already have been stripped.
encoding = "utf-8"
yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
while True: # loop over lines in stream
try:
line = readline()
except StopIteration:
line = b''
if encoding is not None:
line = line.decode(encoding)
lnum += 1
pos, max = 0, len(line)
if contstr: # continued string
if not line:
raise TokenError("EOF in multi-line string", strstart)
endmatch = endprog.match(line)
if endmatch:
pos = end = endmatch.end(0)
yield TokenInfo(STRING, contstr + line[:end],
strstart, (lnum, end), contline + line)
contstr, needcont = '', 0
contline = None
elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
yield TokenInfo(ERRORTOKEN, contstr + line,
strstart, (lnum, len(line)), contline)
contstr = ''
contline = None
continue
else:
contstr = contstr + line
contline = contline + line
continue
elif parenlev == 0 and not continued: # new statement
if not line: break
column = 0
while pos < max: # measure leading whitespace
if line[pos] == ' ':
column += 1
elif line[pos] == '\t':
column = (column//tabsize + 1)*tabsize
elif line[pos] == '\f':
column = 0
else:
break
pos += 1
if pos == max:
break
if line[pos] in '#\r\n': # skip comments or blank lines
if line[pos] == '#':
comment_token = line[pos:].rstrip('\r\n')
nl_pos = pos + len(comment_token)
yield TokenInfo(COMMENT, comment_token,
(lnum, pos), (lnum, pos + len(comment_token)), line)
yield TokenInfo(NL, line[nl_pos:],
(lnum, nl_pos), (lnum, len(line)), line)
else:
yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
(lnum, pos), (lnum, len(line)), line)
continue
if column > indents[-1]: # count indents or dedents
indents.append(column)
yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
while column < indents[-1]:
if column not in indents:
raise IndentationError(
"unindent does not match any outer indentation level",
("<tokenize>", lnum, pos, line))
indents = indents[:-1]
yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
else: # continued statement
if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0))
continued = 0
while pos < max:
pseudomatch = pseudoprog.match(line, pos)
if pseudomatch: # scan for tokens
start, end = pseudomatch.span(1)
spos, epos, pos = (lnum, start), (lnum, end), end
token, initial = line[start:end], line[start]
if (initial in numchars or # ordinary number
(initial == '.' and token != '.' and token != '...')):
yield TokenInfo(NUMBER, token, spos, epos, line)
elif initial in '\r\n':
yield TokenInfo(NL if parenlev > 0 else NEWLINE,
token, spos, epos, line)
elif initial == '#':
assert not token.endswith("\n")
yield TokenInfo(COMMENT, token, spos, epos, line)
elif token in triple_quoted:
endprog = endprogs[token]
endmatch = endprog.match(line, pos)
if endmatch: # all on one line
pos = endmatch.end(0)
token = line[start:pos]
yield TokenInfo(STRING, token, spos, (lnum, pos), line)
else:
strstart = (lnum, start) # multiple lines
contstr = line[start:]
contline = line
break
elif initial in single_quoted or \
token[:2] in single_quoted or \
token[:3] in single_quoted:
if token[-1] == '\n': # continued string
strstart = (lnum, start)
endprog = (endprogs[initial] or endprogs[token[1]] or
endprogs[token[2]])
contstr, needcont = line[start:], 1
contline = line
break
else: # ordinary string
yield TokenInfo(STRING, token, spos, epos, line)
elif initial in namechars: # ordinary name
yield TokenInfo(NAME, token, spos, epos, line)
elif initial == '\\': # continued stmt
continued = 1
else:
if initial in '([{':
parenlev += 1
elif initial in ')]}':
parenlev -= 1
yield TokenInfo(OP, token, spos, epos, line)
else:
yield TokenInfo(ERRORTOKEN, line[pos],
(lnum, pos), (lnum, pos + 1), line)
pos += 1
for indent in indents[1:]: # pop remaining indent levels
yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
return _tokenize(readline, None)