The tokenizer now includes all newlines and comments in its prefix.

Dave Halter
2014-12-17 20:11:42 +01:00
parent 9cdf6de206
commit b2e54ca1eb
4 changed files with 24 additions and 13 deletions
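
The "prefix" notion follows lib2to3: every leaf owns the whitespace, newlines, and comments that precede it, so the tree can reproduce the source exactly. A quick demonstration of that convention with the stdlib's lib2to3 itself (available up to CPython 3.12; this is not jedi's API):

    from lib2to3 import pygram, pytree
    from lib2to3.pgen2 import driver

    drv = driver.Driver(pygram.python_grammar, convert=pytree.convert)
    tree = drv.parse_string("# a comment\nx = 1\n")
    first_leaf = next(tree.leaves())
    print(repr(first_leaf.prefix))  # '# a comment\n'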

View File

@@ -5,6 +5,7 @@ the interesting information about completion and goto operations.
 """
 import warnings
 from itertools import chain
+import re
 
 from jedi._compatibility import next, unicode, use_metaclass
 from jedi import settings
@@ -579,7 +580,10 @@ class Definition(use_metaclass(CachedMetaClass, BaseDefinition)):
                 d = d.get_code()
             finally:
                 first_leaf.prefix = old
-        return d.replace('\n', '').replace('\r', '')
+        # Delete comments:
+        d = re.sub('#[^\n]+\n', ' ', d)
+        # Delete multi spaces/newlines
+        return re.sub('\s+', ' ', d).strip()
 
     @property
     def desc_with_module(self):
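
For illustration, the effect of the new cleanup on a multi-line definition (a hedged sketch with a made-up input; raw-string versions of the patterns above):

    import re

    d = "for x in range(3):\n    # irrelevant\n    x"
    d = re.sub(r'#[^\n]+\n', ' ', d)    # delete comments
    d = re.sub(r'\s+', ' ', d).strip()  # collapse whitespace/newlines
    print(d)  # for x in range(3): x

The old replace('\n', '').replace('\r', '') only deleted line breaks, which left indentation runs in the description (see the test changes below) and could not cope with the comments that get_code now includes.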

View File

@@ -14,8 +14,8 @@ from __future__ import absolute_import
 import string
 import re
 from io import StringIO
-from token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, NAME, OP,
-                   ERRORTOKEN, NEWLINE, INDENT, DEDENT)
+from jedi.parser.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER,
+                               NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
 
 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
 
@@ -147,8 +147,11 @@ def source_tokens(source, line_offset=0):
 
 def generate_tokens(readline, line_offset=0):
     """
-    The original stdlib Python version with minor modifications.
-    Modified to not care about dedents.
+    A heavily modified Python standard library tokenizer.
+
+    Additionally to the default information, yields also the prefix of each
+    token. This idea comes from lib2to3. The prefix contains all information
+    that is irrelevant for the parser like newlines in parentheses or comments.
     """
     paren_level = 0  # count parentheses
     indents = [0]
@@ -158,6 +161,7 @@ def generate_tokens(readline, line_offset=0):
     contline = None
     new_line = False
     prefix = ''  # Should never be required, but here for safety
+    additional_prefix = ''
    while True:            # loop over lines in stream
         line = readline()  # readline returns empty when finished. See StringIO
         if not line:
@@ -192,7 +196,8 @@ def generate_tokens(readline, line_offset=0):
                 pos += 1
                 continue
-            prefix = pseudomatch.group(1)
+            prefix = pseudomatch.group(1) + additional_prefix
+            additional_prefix = ''
             start, pos = pseudomatch.span(2)
             spos = (lnum, start)
             token, initial = line[start:pos], line[start]
@@ -213,10 +218,12 @@ def generate_tokens(readline, line_offset=0):
             elif initial in '\r\n':
                 if not new_line and paren_level == 0:
                     yield NEWLINE, token, spos, prefix
+                else:
+                    additional_prefix = prefix + token
                 new_line = True
-            elif initial == '#':
+            elif initial == '#':  # Comments
                 assert not token.endswith("\n")
-                #yield Token(COMMENT, token, spos, prefix)
+                additional_prefix = prefix + token
             elif token in triple_quoted:
                 endprog = endprogs[token]
                 endmatch = endprog.match(line, pos)
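
In short: comments and insignificant newlines are no longer emitted (or silently dropped); they accumulate in additional_prefix until the next real token claims them. A stripped-down sketch of that buffering (a toy scanner with a made-up regex and token set, not jedi's real tokenizer):

    import re

    PATTERN = re.compile(r'([ \t]*)(#[^\n]*|\n|\w+|[^\s])')

    def toy_tokens(source):
        additional_prefix = ''
        pos = 0
        while pos < len(source):
            match = PATTERN.match(source, pos)
            if match is None:  # nothing but trailing whitespace left
                break
            whitespace, value = match.groups()
            pos = match.end()
            if value.startswith('#') or value == '\n':
                # Buffer comments and newlines as prefix material.
                additional_prefix += whitespace + value
            else:
                yield value, additional_prefix + whitespace
                additional_prefix = ''

    for tok, prefix in toy_tokens("x = 1  # note\ny"):
        print(repr(tok), repr(prefix))
    # 'x' ''   '=' ' '   '1' ' '   'y' '  # note\n'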

View File

@@ -59,7 +59,7 @@ class UserContext(object):
                 first_line = (tok_str.splitlines() or [''])[0]
                 column -= len(first_line)
             # Reverse the token again, so that it is in normal order again.
-            yield typ, tok_str[::-1], (self._line_temp, column), prefix
+            yield typ, tok_str[::-1], (self._line_temp, column), prefix[::-1]
 
     def _calc_path_until_cursor(self, start_pos):
         """
@@ -122,7 +122,7 @@ class UserContext(object):
                     break
                 start_cursor = tok_start_pos
-                string = tok_str + string
+                string = tok_str + prefix + string
                 last_type = tok_type
 
                 # Don't need whitespace around a statement.
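
The two changes belong together: this scanner tokenizes reversed source, so prefixes arrive mirrored (hence prefix[::-1]), and a token's prefix is re-attached between it and the already-collected text when rebuilding the path (tok_str + prefix + string). A hypothetical round trip with made-up scan output:

    source = 'foo # c\n.bar'
    # (token, prefix) pairs as the reversed scan might emit them:
    backwards = [('rab', ''), ('.', ''), ('oof', '\nc # ')]
    string = ''
    for tok_str, prefix in backwards:
        tok_str, prefix = tok_str[::-1], prefix[::-1]  # un-reverse both
        string = tok_str + prefix + string             # prefix re-attached
    print(string == source)  # True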

View File

@@ -181,13 +181,13 @@ ab1(ClassDef);ab2(ClassDef);ab3(ClassDef)
 # -----------------
 for i in range(1):
-    #! ['for i in range(1):    i']
+    #! ['for i in range(1): i']
     i
 
 for key, value in [(1,2)]:
-    #! ['for key, value in [(1,2)]:    key']
+    #! ['for key, value in [(1,2)]: key']
     key
 
 for i in []:
-    #! ['for i in []:    i']
+    #! ['for i in []: i']
     i
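
The updated expectations follow from the new description cleanup (the re.sub calls in the first file); a small illustrative check:

    import re

    code = "for i in []:\n    i"
    old = code.replace('\n', '').replace('\r', '')
    new = re.sub(r'\s+', ' ', re.sub(r'#[^\n]+\n', ' ', code)).strip()
    print(repr(old))  # 'for i in []:    i'
    print(repr(new))  # 'for i in []: i'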