Commit in davidhalter/jedi (mirror of https://github.com/davidhalter/jedi.git):

The tokenizer now includes all newlines and comments in its prefix.
@@ -5,6 +5,7 @@ the interesting information about completion and goto operations.
 """
 import warnings
 from itertools import chain
+import re
 
 from jedi._compatibility import next, unicode, use_metaclass
 from jedi import settings
@@ -579,7 +580,10 @@ class Definition(use_metaclass(CachedMetaClass, BaseDefinition)):
             d = d.get_code()
         finally:
             first_leaf.prefix = old
-        return d.replace('\n', '').replace('\r', '')
+        # Delete comments:
+        d = re.sub('#[^\n]+\n', ' ', d)
+        # Delete multi spaces/newlines
+        return re.sub('\s+', ' ', d).strip()
 
     @property
     def desc_with_module(self):
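With comments and newlines now kept in token prefixes, get_code() can return multi-line text, so the description is scrubbed with the two re.sub calls above. A minimal runnable sketch of that cleanup (the sample string is made up for illustration):

    import re

    d = "def f(a,  # the first argument\n      b):\n"
    d = re.sub(r'#[^\n]+\n', ' ', d)    # delete comments
    d = re.sub(r'\s+', ' ', d).strip()  # collapse multi spaces/newlines
    print(d)  # -> 'def f(a, b):'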
@@ -14,8 +14,8 @@ from __future__ import absolute_import
 import string
 import re
 from io import StringIO
-from token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, NAME, OP,
-                   ERRORTOKEN, NEWLINE, INDENT, DEDENT)
+from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER,
+                               NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
 
 
 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
@@ -147,8 +147,11 @@ def source_tokens(source, line_offset=0):
 
 def generate_tokens(readline, line_offset=0):
     """
-    The original stdlib Python version with minor modifications.
-    Modified to not care about dedents.
+    A heavily modified Python standard library tokenizer.
+
+    Additionally to the default information, yields also the prefix of each
+    token. This idea comes from lib2to3. The prefix contains all information
+    that is irrelevant for the parser like newlines in parentheses or comments.
     """
     paren_level = 0  # count parentheses
     indents = [0]
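The new docstring describes the lib2to3-style prefix model. A hand-built sketch of the invariant it aims for (these token tuples are illustrative, not actual jedi output): concatenating prefix + value over the whole stream reproduces the source, because everything the parser ignores survives in some token's prefix.

    # Illustrative (type, value, prefix) stream for "x = 1  # answer\ny = 2\n":
    tokens = [
        ('NAME',    'x',  ''),
        ('OP',      '=',  ' '),
        ('NUMBER',  '1',  ' '),
        ('NEWLINE', '\n', '  # answer'),  # the comment lives in the prefix
        ('NAME',    'y',  ''),
        ('OP',      '=',  ' '),
        ('NUMBER',  '2',  ' '),
        ('NEWLINE', '\n', ''),
    ]
    source = ''.join(prefix + value for _, value, prefix in tokens)
    assert source == 'x = 1  # answer\ny = 2\n'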
@@ -158,6 +161,7 @@ def generate_tokens(readline, line_offset=0):
     contline = None
     new_line = False
     prefix = ''  # Should never be required, but here for safety
+    additional_prefix = ''
     while True:  # loop over lines in stream
         line = readline()  # readline returns empty when finished. See StringIO
         if not line:
@@ -192,7 +196,8 @@ def generate_tokens(readline, line_offset=0):
                 pos += 1
                 continue
 
-            prefix = pseudomatch.group(1)
+            prefix = pseudomatch.group(1) + additional_prefix
+            additional_prefix = ''
             start, pos = pseudomatch.span(2)
             spos = (lnum, start)
             token, initial = line[start:pos], line[start]
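This is the consuming side of the new buffer: anything stashed in additional_prefix by earlier iterations is glued onto the next emitted token's prefix, and the buffer is cleared. A small sketch of just that hand-off (the helper name and values are made up):

    def attach(whitespace, additional_prefix):
        # Mimics: prefix = pseudomatch.group(1) + additional_prefix,
        # followed by the buffer reset.
        return whitespace + additional_prefix, ''

    prefix, additional_prefix = attach('    ', '# note\n')
    print(repr(prefix))  # '    # note\n'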
@@ -213,10 +218,12 @@ def generate_tokens(readline, line_offset=0):
             elif initial in '\r\n':
                 if not new_line and paren_level == 0:
                     yield NEWLINE, token, spos, prefix
+                else:
+                    additional_prefix = prefix + token
                 new_line = True
-            elif initial == '#':
+            elif initial == '#':  # Comments
                 assert not token.endswith("\n")
-                #yield Token(COMMENT, token, spos, prefix)
+                additional_prefix = prefix + token
             elif token in triple_quoted:
                 endprog = endprogs[token]
                 endmatch = endprog.match(line, pos)
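And this is the producing side: a newline inside parentheses is no longer dropped, and a comment no longer vanishes; both are buffered for the next token. A runnable simulation of the branch taken for the newline in "f(1,\n2)" (simplified from the diff, with the tokenizer state inlined):

    paren_level = 1                      # inside "f("
    new_line = False
    additional_prefix = ''

    prefix, token = '', '\n'             # the newline after "1,"
    if not new_line and paren_level == 0:
        pass                             # top level: would yield NEWLINE
    else:
        additional_prefix = prefix + token   # inside parens: buffer it

    # Next token, the literal 2 on the following line:
    prefix = '' + additional_prefix      # pseudomatch.group(1) is empty here
    additional_prefix = ''
    print(repr(prefix))                  # '\n' -- the newline survived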
@@ -59,7 +59,7 @@ class UserContext(object):
            first_line = (tok_str.splitlines() or [''])[0]
            column -= len(first_line)
            # Reverse the token again, so that it is in normal order again.
-           yield typ, tok_str[::-1], (self._line_temp, column), prefix
+           yield typ, tok_str[::-1], (self._line_temp, column), prefix[::-1]
 
    def _calc_path_until_cursor(self, start_pos):
        """
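UserContext tokenizes the text before the cursor in reverse, so everything it yields was matched against reversed source and must be flipped back; now that prefixes carry comments and newlines, they need the same flip. A tiny sketch with illustrative values:

    tok_str = 'eman'    # 'name', as matched in the reversed source
    prefix = '\n ba# '  # ' #ab \n', reversed
    print(tok_str[::-1], repr(prefix[::-1]))  # name ' #ab \n'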
@@ -122,7 +122,7 @@ class UserContext(object):
                break
 
            start_cursor = tok_start_pos
-           string = tok_str + string
+           string = tok_str + prefix + string
            last_type = tok_type
 
        # Don't need whitespace around a statement.
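Finally, _calc_path_until_cursor rebuilds the expression leading up to the cursor by prepending one backwards token at a time, and it now keeps each token's prefix so spacing, comments, and newlines survive in the rebuilt path. A sketch of the assembly (the pairs are illustrative, in the cursor-first order the reverse tokenizer yields them, each prefix being the text that follows its token in normal order):

    string = ''
    for tok_str, prefix in [('b', ''), ('.', ' '), ('a', '')]:
        string = tok_str + prefix + string   # as in the diff
    print(string)  # 'a. b'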