forked from VimPlug/jedi

removed unnecessary stuff for the jedi tokenizer

This commit is contained in:
David Halter
2013-04-19 23:59:33 +04:30
parent a28bc7195f
commit 3b0f4b87cf


@@ -1,4 +1,11 @@
"""Tokenization help for Python programs.
"""
This tokenizer has been copied from the ``tokenize.py`` standard library
tokenizer. The reason was simple: the standard library tokenizer fails
if the indentation is not right. The fast parser of jedi, however, requires
"wrong" indentation.
Tokenization help for Python programs.
++++++++++++++++++++++++++++++++++++++
tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens. It decodes the bytes according to PEP-0263 for
@@ -20,23 +27,12 @@ operators. Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import builtins
import re
import sys
from token import *
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
@@ -152,107 +148,6 @@ class TokenError(Exception): pass
class StopTokenizing(Exception): pass
class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)
    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue
            if toknum in (NAME, NUMBER):
                tokval += ' '
            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False
            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
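The limited round-trip described in the docstring can be checked with a short, self-contained sketch (illustrative only, written against the present-day standard library tokenize module rather than this copy):

import io
import tokenize as stdlib_tokenize

code = b"x = 1\nif x:\n    y = 2\n"
t1 = [tok[:2] for tok in stdlib_tokenize.tokenize(io.BytesIO(code).readline)]

newcode = stdlib_tokenize.untokenize(t1)   # bytes, because t1 starts with an ENCODING token
t2 = [tok[:2] for tok in stdlib_tokenize.tokenize(io.BytesIO(newcode).readline)]

assert t1 == t2   # output bytes tokenize back to the input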
def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
@@ -336,18 +231,6 @@ def detect_encoding(readline):
    return default, [first, second]

def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text
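As a point of reference, detect_encoding() can also be exercised on its own; a small illustrative sketch using the standard library module and a made-up coding cookie:

import io
import tokenize as stdlib_tokenize

buf = io.BytesIO(b"# -*- coding: latin-1 -*-\nname = 'value'\n")
encoding, consumed = stdlib_tokenize.detect_encoding(buf.readline)
print(encoding)   # 'iso-8859-1', the normalized name for latin-1
print(consumed)   # the line(s) read while looking for the cookie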
def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
@@ -523,34 +406,3 @@ def _tokenize(readline, encoding):
    for indent in indents[1:]: # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
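Because generate_tokens() skips encoding detection, it can be fed already-decoded text directly; a short usage sketch (names chosen for illustration) against the standard library module:

import io
import tokenize as stdlib_tokenize

src = "answer = 42\n"
for tok in stdlib_tokenize.generate_tokens(io.StringIO(src).readline):
    print(tok.type, repr(tok.string))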
if __name__ == "__main__":
    # Quick sanity check
    s = b'''def parseline(self, line):
        """Parse the line into a command name and a string containing
        the arguments. Returns a tuple containing (command, args, line).
        'command' and 'args' may be None if the line couldn't be parsed.
        """
        line = line.strip()
        if not line:
            return None, None, line
        elif line[0] == '?':
            line = 'help ' + line[1:]
        elif line[0] == '!':
            if hasattr(self, 'do_shell'):
                line = 'shell ' + line[1:]
            else:
                return None, None, line
        i, n = 0, len(line)
        while i < n and line[i] in self.identchars: i = i+1
        cmd, arg = line[:i], line[i:].strip()
        return cmd, arg, line
    '''
    for tok in tokenize(iter(s.splitlines()).__next__):
        print(tok)