forked from VimPlug/jedi

removed unnecessary stuff for the jedi tokenizer

This commit is contained in:
David Halter
2013-04-19 23:59:33 +04:30
parent a28bc7195f
commit 3b0f4b87cf


@@ -1,4 +1,11 @@
"""Tokenization help for Python programs.
"""
This tokenizer has been copied from the ``tokenize.py`` standard library
tokenizer. The reason was simple: the standard library tokenizer fails
if the indentation is not right. The fast parser of jedi, however, requires
"wrong" indentation.
Tokenization help for Python programs.
++++++++++++++++++++++++++++++++++++++
tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens. It decodes the bytes according to PEP-0263 for
@@ -20,23 +27,12 @@ operators. Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import builtins
import re
import sys
from token import *
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
@@ -152,107 +148,6 @@ class TokenError(Exception): pass
class StopTokenizing(Exception): pass
class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)
    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue
            if toknum in (NAME, NUMBER):
                tokval += ' '
            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False
            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
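The limited round-trip described in the docstring can be checked with a short, self-contained sketch (illustrative only, written against the present-day standard library tokenize module rather than this copy):

import io
import tokenize as stdlib_tokenize

code = b"x = 1\nif x:\n    y = 2\n"
t1 = [tok[:2] for tok in stdlib_tokenize.tokenize(io.BytesIO(code).readline)]

newcode = stdlib_tokenize.untokenize(t1)   # bytes, because t1 starts with an ENCODING token
t2 = [tok[:2] for tok in stdlib_tokenize.tokenize(io.BytesIO(newcode).readline)]

assert t1 == t2   # output bytes tokenize back to the input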
def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
@@ -336,18 +231,6 @@ def detect_encoding(readline):
    return default, [first, second]

def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text
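As a point of reference, detect_encoding() can also be exercised on its own; a small illustrative sketch using the standard library module and a made-up coding cookie:

import io
import tokenize as stdlib_tokenize

buf = io.BytesIO(b"# -*- coding: latin-1 -*-\nname = 'value'\n")
encoding, consumed = stdlib_tokenize.detect_encoding(buf.readline)
print(encoding)   # 'iso-8859-1', the normalized name for latin-1
print(consumed)   # the line(s) read while looking for the cookie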
def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
@@ -523,34 +406,3 @@ def _tokenize(readline, encoding):
    for indent in indents[1:]: # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
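Because generate_tokens() skips encoding detection, it can be fed already-decoded text directly; a short usage sketch (names chosen for illustration) against the standard library module:

import io
import tokenize as stdlib_tokenize

src = "answer = 42\n"
for tok in stdlib_tokenize.generate_tokens(io.StringIO(src).readline):
    print(tok.type, repr(tok.string))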
if __name__ == "__main__":
    # Quick sanity check
    s = b'''def parseline(self, line):
        """Parse the line into a command name and a string containing
        the arguments. Returns a tuple containing (command, args, line).
        'command' and 'args' may be None if the line couldn't be parsed.
        """
        line = line.strip()
        if not line:
            return None, None, line
        elif line[0] == '?':
            line = 'help ' + line[1:]
        elif line[0] == '!':
            if hasattr(self, 'do_shell'):
                line = 'shell ' + line[1:]
            else:
                return None, None, line
        i, n = 0, len(line)
        while i < n and line[i] in self.identchars: i = i+1
        cmd, arg = line[:i], line[i:].strip()
        return cmd, arg, line
    '''
    for tok in tokenize(iter(s.splitlines()).__next__):
        print(tok)