Mirror of https://github.com/davidhalter/jedi.git
The new tokenizer is more or less working now. Indents are calculated as they should be.
@@ -131,7 +131,7 @@ class Parser(object):
         p = pgen2.parse.Parser(grammar, self.convert_node, self.convert_leaf,
                                self.error_recovery)
         tokenizer = tokenizer or tokenize.source_tokens(source)
-        self.module = p.parse(self._tokenize(tokenizer))
+        self.module = p.parse(p.tokenize(self._tokenize(tokenizer)))

         self.module.used_names = self.used_names
         self.module.path = module_path
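The notable change in this hunk is the extra p.tokenize(...) stage: the raw token stream now runs through the pgen2 parser's indent-adding generator before it reaches parse(). Because every stage is a generator, the stages compose lazily. A minimal sketch of that composition pattern is below; the stage names are hypothetical stand-ins, not jedi's API.

    # A minimal sketch of the lazy generator pipeline used in
    # p.parse(p.tokenize(self._tokenize(tokenizer))).  Stage names below are
    # illustrative assumptions; only the composition pattern mirrors the commit.
    def produce(values):
        for v in values:
            yield v

    def augment(stream):
        # Comparable to Parser.tokenize(): passes tokens through and may
        # insert extra ones (here: a marker after every value).
        for v in stream:
            yield v
            yield 'MARK'

    def consume(stream):
        # Comparable to Parser.parse(): the only stage that actually iterates.
        return list(stream)

    print(consume(augment(produce([1, 2, 3]))))   # [1, 'MARK', 2, 'MARK', 3, 'MARK']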
@@ -247,6 +247,18 @@ class Parser(object):
         stack[start_index:] = []

     def _tokenize(self, tokenizer):
+        """
+        while first_pos[1] <= self._scope.start_pos[1] \
+                and (token_type == tokenize.NAME or tok_str in ('(', '['))\
+                and self._scope != self.module:
+            self._scope.end_pos = first_pos
+            self._scope = self._scope.parent
+            if isinstance(self._scope, pr.Module) \
+                    and not isinstance(self._scope, pr.SubModule):
+                self._scope = self.module
+        """
+
+        new_scope = False
         for token in tokenizer:
             typ = token.type
             value = token.value
@@ -96,9 +96,32 @@ class Parser(object):
         self.stack = [stackentry]
         self.rootnode = None
         self.error_recovery = error_recovery
+        indent_errors = []  # TODO generate those.

-    def parse(self, tokens):
-        for type, value, prefix, start_pos in tokens:
+    def tokenize(self, tokenizer):
+        """
+        This is not a real tokenizer, but it adds indents. You could hand the
+        parse function a normal tokenizer (e.g. the lib2to3 one). But if we use
+        the parser stack we are able to do error recovery from wrong indents.
+        """
+        indents = [0]
+        new_line = False
+        for type, value, prefix, start_pos in tokenizer:
+            if type == token.NEWLINE:
+                new_line = True
+            elif new_line:
+                indent = start_pos[1]
+                if indent > indents[-1]:
+                    yield token.INDENT, '', '', start_pos
+                    indents.append(indent)
+                while indent < indents[-1]:
+                    yield token.DEDENT, '', '', start_pos
+                    indents.pop()
+                new_line = False
+            yield type, value, prefix, start_pos
+
+    def parse(self, tokenizer):
+        for type, value, prefix, start_pos in tokenizer:
             if self.addtoken(type, value, prefix, start_pos):
                 break
         else:
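The tokenize() generator added above (in the pgen2 Parser) keeps a stack of active indentation columns and emits INDENT/DEDENT pseudo-tokens whenever the column of the first token on a new logical line grows or shrinks. A standalone sketch of that idea, on simplified tuples rather than jedi's token objects (shapes and names here are illustrative only):

    # Sketch of the indent-tracking logic from Parser.tokenize() above,
    # applied to toy (type, value, start_pos) tuples.
    NEWLINE, NAME, INDENT, DEDENT = 'NEWLINE', 'NAME', 'INDENT', 'DEDENT'

    def add_indents(tokens):
        indents = [0]          # stack of active indentation columns
        new_line = False
        for typ, value, start_pos in tokens:
            if typ == NEWLINE:
                new_line = True
            elif new_line:
                indent = start_pos[1]      # column of the first token on the line
                if indent > indents[-1]:
                    yield (INDENT, '', start_pos)
                    indents.append(indent)
                while indent < indents[-1]:
                    yield (DEDENT, '', start_pos)
                    indents.pop()
                new_line = False
            yield (typ, value, start_pos)

    # A toy stream: `if x`, then an indented `y`, then `z` back at column 0.
    toy = [(NAME, 'if', (1, 0)), (NAME, 'x', (1, 3)), (NEWLINE, '\n', (1, 4)),
           (NAME, 'y', (2, 4)), (NEWLINE, '\n', (2, 5)),
           (NAME, 'z', (3, 0))]
    for tok in add_indents(toy):
        print(tok)   # INDENT is emitted before y, DEDENT before z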
@@ -66,7 +66,7 @@ class Token(object):
         self.value = value
         self._start_pos_line = start_pos[0]
         self._start_pos_col = start_pos[1]
-        self.prefix = whitespace
+        self.prefix = prefix

     def __repr__(self):
         typ = tok_name[self.type]
@@ -228,6 +228,7 @@ def generate_tokens(readline, line_offset=0):
     numchars = '0123456789'
     contstr = ''
     contline = None
+    new_line = False
     prefix = ''  # Should never be required, but here for safety
     while True:  # loop over lines in stream
         line = readline()  # readline returns empty when finished. See StringIO
@@ -269,14 +270,19 @@ def generate_tokens(readline, line_offset=0):
             spos = (lnum, start)
             token, initial = line[start:pos], line[start]

+            if new_line and initial not in '\r\n#':
+                new_line = False
+
             if (initial in numchars or   # ordinary number
                     (initial == '.' and token != '.' and token != '...')):
                 yield Token(NUMBER, token, spos, prefix)
             elif initial in '\r\n':
-                yield Token(NEWLINE, token, spos, prefix)
+                if not new_line:
+                    yield Token(NEWLINE, token, spos, prefix)
+                new_line = True
             elif initial == '#':
                 assert not token.endswith("\n")
-                yield Token(COMMENT, token, spos, prefix)
+                #yield Token(COMMENT, token, spos, prefix)
             elif token in triple_quoted:
                 endprog = endprogs[token]
                 endmatch = endprog.match(line, pos)
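With the new_line flag in generate_tokens, a run of blank lines collapses to a single NEWLINE token, and comment-only lines neither emit a NEWLINE nor end the suppression; this is what the indent pass in Parser.tokenize relies on. A short sketch of the collapsing rule, on plain strings rather than jedi's Token objects (an illustration of the rule, not the real tokenizer):

    # Sketch of the NEWLINE-collapsing behaviour added above.
    def collapse_newlines(items):
        new_line = False
        for item in items:
            initial = item[0] if item else '\n'
            if new_line and initial not in '\r\n#':
                new_line = False            # real code resumes: NEWLINEs allowed again
            if initial in '\r\n':
                if not new_line:
                    yield 'NEWLINE'         # only the first newline of a run is emitted
                new_line = True
            elif initial == '#':
                pass                        # comments neither emit nor reset anything
            else:
                yield 'CODE'

    print(list(collapse_newlines(['a = 1', '\n', '\n', '# note', '\n', 'b = 2'])))
    # -> ['CODE', 'NEWLINE', 'CODE']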