
tokenizer: removed encoding

David Halter
2013-04-20 09:28:32 +04:30
parent 6b2619a844
commit 0d385563a5


@@ -4,27 +4,8 @@ tokenizer. The reason was simple: The standard library tokenizer fails
 if the indentation is not right. The fast parser of jedi however requires
 "wrong" indentation.
 
-Tokenization help for Python programs.
-++++++++++++++++++++++++++++++++++++++
-
-tokenize(readline) is a generator that breaks a stream of bytes into
-Python tokens.  It decodes the bytes according to PEP-0263 for
-determining source file encoding.
-
-It accepts a readline-like method which is called repeatedly to get the
-next line of input (or b"" for EOF).  It generates 5-tuples with these
-members:
-
-    the token type (see token.py)
-    the token (a string)
-    the starting (row, column) indices of the token (a 2-tuple of ints)
-    the ending (row, column) indices of the token (a 2-tuple of ints)
-    the original line (string)
-
-It is designed to match the working of the Python tokenizer exactly, except
-that it produces COMMENT tokens for comments and gives type OP for all
-operators.  Additionally, all token lists start with an ENCODING token
-which tells you which encoding was used to decode the bytes stream.
+Basically this is a stripped down version of the standard library module, so
+you can read the documentation there.
 """
 import string
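The replacement docstring defers to the standard library documentation, and the token shape is unchanged: the generator still yields the 5-tuples the removed text described (type, string, start, end, source line). A minimal usage sketch; the import path below is an assumption for illustration, not the actual location in jedi's tree:

    import io
    from jedi import tokenizer  # assumed import path, for illustration only

    source = "def f():\n    return 1\n"
    readline = io.StringIO(source).readline  # text lines; no byte decoding here
    for tok_type, tok_str, start, end, line in tokenizer.generate_tokens(readline):
        # Each token is a 5-tuple: type, string, (row, col) start, (row, col) end, line.
        print(tok_type, repr(tok_str), start, end)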
@@ -149,26 +130,19 @@ tabsize = 8
 
 class TokenError(Exception): pass
 
-def _tokenize(readline, encoding):
+def generate_tokens(readline):
     lnum = parenlev = continued = 0
     numchars = '0123456789'
     contstr, needcont = '', 0
     contline = None
     indents = [0]
 
-    if encoding is not None:
-        if encoding == "utf-8-sig":
-            # BOM will already have been stripped.
-            encoding = "utf-8"
-        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-
     while True:                # loop over lines in stream
         try:
             line = readline()
         except StopIteration:
             line = b''
 
-        if encoding is not None:
-            line = line.decode(encoding)
         lnum += 1
         pos, max = 0, len(line)
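The signature change shifts decoding to the caller: the generator now receives already-decoded text and no longer yields a leading ENCODING token. A sketch of the behavioral difference, assuming generate_tokens from this module is in scope:

    import io

    # Before this commit (stdlib-style): bytes in, ENCODING token yielded first.
    #     tokens = _tokenize(io.BytesIO(b"x = 1\n").readline, "utf-8")
    # After: text in, and the first token is already a real source token.
    tokens = generate_tokens(io.StringIO("x = 1\n").readline)
    first = next(tokens)
    print(first[0])  # NAME (for 'x'), not ENCODING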
@@ -296,9 +270,3 @@ def _tokenize(readline, encoding):
     for indent in indents[1:]:                 # pop remaining indent levels
         yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
     yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
-
-
-# An undocumented, backwards compatible, API for all the places in the standard
-# library that expect to be able to use tokenize with strings
-def generate_tokens(readline):
-    return _tokenize(readline, None)
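With the encoding parameter gone there is nothing left for the old generate_tokens wrapper to adapt, so the generator takes that name directly. The end-of-stream handling in this hunk pops any still-open indentation before the end marker; a sketch of the expected tail of the token stream, assuming stdlib-compatible behavior as the new docstring suggests:

    import io
    from token import tok_name

    toks = list(generate_tokens(io.StringIO("if x:\n    y\n").readline))
    # The indented block is closed at EOF, so the stream should end with
    # NEWLINE, DEDENT, ENDMARKER.
    print([tok_name[t[0]] for t in toks][-3:])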