From 0d385563a5d1bb1db3e20e8fb6bec6ce72099d19 Mon Sep 17 00:00:00 2001
From: David Halter
Date: Sat, 20 Apr 2013 09:28:32 +0430
Subject: [PATCH] tokenizer: removed encoding

---
 jedi/tokenizer.py | 38 +++-----------------------------------
 1 file changed, 3 insertions(+), 35 deletions(-)

diff --git a/jedi/tokenizer.py b/jedi/tokenizer.py
index c5ff5e2f..e48cf30b 100644
--- a/jedi/tokenizer.py
+++ b/jedi/tokenizer.py
@@ -4,27 +4,8 @@
 tokenizer. The reason was simple: The standard library tokenizer fails
 if the indentation is not right. The fast parser of jedi however requires
 "wrong" indentation.
-Tokenization help for Python programs.
-++++++++++++++++++++++++++++++++++++++
-
-tokenize(readline) is a generator that breaks a stream of bytes into
-Python tokens. It decodes the bytes according to PEP-0263 for
-determining source file encoding.
-
-It accepts a readline-like method which is called repeatedly to get the
-next line of input (or b"" for EOF). It generates 5-tuples with these
-members:
-
-    the token type (see token.py)
-    the token (a string)
-    the starting (row, column) indices of the token (a 2-tuple of ints)
-    the ending (row, column) indices of the token (a 2-tuple of ints)
-    the original line (string)
-
-It is designed to match the working of the Python tokenizer exactly, except
-that it produces COMMENT tokens for comments and gives type OP for all
-operators. Additionally, all token lists start with an ENCODING token
-which tells you which encoding was used to decode the bytes stream.
+Basically this is a stripped down version of the standard library module, so
+you can read the documentation there.
 """
 
 import string
@@ -149,26 +130,19 @@ tabsize = 8
 class TokenError(Exception):
     pass
 
-def _tokenize(readline, encoding):
+def generate_tokens(readline):
     lnum = parenlev = continued = 0
     numchars = '0123456789'
     contstr, needcont = '', 0
     contline = None
     indents = [0]
 
-    if encoding is not None:
-        if encoding == "utf-8-sig":
-            # BOM will already have been stripped.
-            encoding = "utf-8"
-        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
     while True:             # loop over lines in stream
         try:
             line = readline()
         except StopIteration:
             line = b''
 
-        if encoding is not None:
-            line = line.decode(encoding)
         lnum += 1
         pos, max = 0, len(line)
 
@@ -296,9 +270,3 @@ def _tokenize(readline, encoding):
     for indent in indents[1:]:                 # pop remaining indent levels
         yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
     yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
-
-
-# An undocumented, backwards compatible, API for all the places in the standard
-# library that expect to be able to use tokenize with strings
-def generate_tokens(readline):
-    return _tokenize(readline, None)
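
With the ENCODING token and the decode step gone, generate_tokens() expects a
readline callable that already yields decoded text, presumably because jedi
decodes source files before tokenizing them. Below is a minimal sketch of the
new calling convention, assuming the module remains importable as
jedi.tokenizer as the paths above suggest; the sample source string and the
print loop are illustrative only, not part of the patch:

    import io

    from jedi.tokenizer import generate_tokens

    # Feed the tokenizer text, not bytes: the decoding that _tokenize()
    # used to do internally is now the caller's responsibility.
    source = "def foo():\n    pass\n"
    readline = io.StringIO(source).readline

    for token in generate_tokens(readline):
        # Each TokenInfo is still a 5-tuple of token type, token string,
        # (row, col) start, (row, col) end, and the physical line; the
        # stream simply no longer begins with an ENCODING token.
        print(token)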