Move the tokenizer/diff parser closer together

2025-12-23 12:41:43 +08:00 · 2020-04-03 00:18:35 +02:00
parent ce170e8aae
commit fb2ea551d5
2 changed files with 19 additions and 41 deletions
--- a/parso/python/diff.py
+++ b/parso/python/diff.py
@@ -397,46 +397,30 @@ class DiffParser(object):
        return self._active_parser.parse(tokens=tokens)
    def _diff_tokenize(self, lines, until_line, line_offset=0):
        is_first_token = True
        omitted_first_indent = False
        was_newline = False
        indents = []
        first_token = next(self._tokenizer(lines))
-        base_indentation = self._nodes_tree.get_base_indentation(first_token.start_pos[1])
+        indents = list(self._nodes_tree.get_indents(first_token.start_pos[1]))
-        if base_indentation > 0:
+        initial_indentation_count = len(indents)
            omitted_first_indent = True
            indents.append(base_indentation)
        tokens = self._tokenizer(
            lines,
            start_pos=(1, 0),
-            base_indentation=base_indentation
+            indents=indents
        )
        stack = self._active_parser.stack
        for typ, string, start_pos, prefix in tokens:
            start_pos = start_pos[0] + line_offset, start_pos[1]
            if typ == PythonTokenTypes.INDENT:
                indents.append(start_pos[1])
                if is_first_token and base_indentation >= start_pos[1]:
                    omitted_first_indent = True
                    # We want to get rid of indents that are only here because
                    # we only parse part of the file. These indents would only
                    # get parsed as error leafs, which doesn't make any sense.
                    is_first_token = False
                    continue
            is_first_token = False
-            # In case of omitted_first_indent, it might not be dedented fully.
+            if typ == PythonTokenTypes.DEDENT:
-            # However this is a sign for us that a dedent happened.
+                if len(indents) < initial_indentation_count:
            if typ == PythonTokenTypes.DEDENT \
                    or typ == PythonTokenTypes.ERROR_DEDENT \
                    and omitted_first_indent and len(indents) == 1:
                indents.pop()
                if omitted_first_indent and not indents:
                    # We are done here, only thing that can come now is an
                    # endmarker or another dedented code block.
                    while True:
                        typ, string, start_pos, prefix = next(tokens)
                        if typ != PythonTokenTypes.DEDENT:
                            break
                    if '\n' in prefix or '\r' in prefix:
                        prefix = re.sub(r'[^\n\r]+\Z', '', prefix)
                    else:
@@ -453,15 +437,9 @@ class DiffParser(object):
                was_newline = True
            elif was_newline:
                was_newline = False
-                if start_pos[1] <= base_indentation:
+                if len(indents) == initial_indentation_count:
                    # Check if the parser is actually in a valid suite state.
                    if _suite_or_file_input_is_valid(self._pgen_grammar, stack):
                        start_pos = start_pos[0] + 1, 0
                        if typ == PythonTokenTypes.INDENT:
                            indents.pop()
                        while len(indents) > int(omitted_first_indent):
                            indents.pop()
                            yield PythonToken(PythonTokenTypes.DEDENT, '', start_pos, '')
                        yield PythonToken(PythonTokenTypes.ENDMARKER, '', start_pos, '')
                        break
@@ -562,11 +540,11 @@ class _NodesTree(object):
        self._prefix_remainder = ''
        self.prefix = ''
-    def get_base_indentation(self, indentation):
+    def get_indents(self, indentation):
-        for node in reversed(self._working_stack):
+        for node in self._working_stack:
            first_indentation = node.get_first_indentation()
            if indentation >= first_indentation:
-                return first_indentation
+                yield first_indentation
    @property
    def parsed_until_line(self):
--- a/parso/python/tokenize.py
+++ b/parso/python/tokenize.py
@@ -386,7 +386,7 @@ def _print_tokens(func):
 # @_print_tokens
-def tokenize_lines(lines, version_info, start_pos=(1, 0), base_indentation=0):
+def tokenize_lines(lines, version_info, start_pos=(1, 0), indents=None):
    """
    A heavily modified Python standard library tokenizer.
@@ -400,16 +400,15 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0), base_indentation=0):
                yield PythonToken(ERROR_DEDENT, '', (lnum, start), '')
                indents[-1] = start
                break
            yield PythonToken(DEDENT, '', spos, '')
            indents.pop()
            yield PythonToken(DEDENT, '', spos, '')
    pseudo_token, single_quoted, triple_quoted, endpats, whitespace, \
        fstring_pattern_map, always_break_tokens, = \
        _get_token_collection(version_info)
    paren_level = 0  # count parentheses
    if indents is None:
        indents = [0]
    if base_indentation:
        indents.append(base_indentation)
    max = 0
    numchars = '0123456789'
    contstr = ''
@@ -670,6 +669,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0), base_indentation=0):
    # As the last position we just take the maximally possible position. We
    # remove -1 for the last new line.
    for indent in indents[1:]:
        indents.pop()
        yield PythonToken(DEDENT, '', end_pos, '')
    yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)