From fb2ea551d570e865727eaa2d43ec7c0d22c44df5 Mon Sep 17 00:00:00 2001
From: Dave Halter
Date: Fri, 3 Apr 2020 00:18:35 +0200
Subject: [PATCH] Move the tokenizer/diff parser closer together

---
 parso/python/diff.py     | 50 +++++++++++-----------------------------
 parso/python/tokenize.py | 10 ++++----
 2 files changed, 19 insertions(+), 41 deletions(-)

diff --git a/parso/python/diff.py b/parso/python/diff.py
index 986fad2..9d56c8c 100644
--- a/parso/python/diff.py
+++ b/parso/python/diff.py
@@ -397,46 +397,30 @@ class DiffParser(object):
         return self._active_parser.parse(tokens=tokens)
 
     def _diff_tokenize(self, lines, until_line, line_offset=0):
-        is_first_token = True
-        omitted_first_indent = False
         was_newline = False
-        indents = []
         first_token = next(self._tokenizer(lines))
-        base_indentation = self._nodes_tree.get_base_indentation(first_token.start_pos[1])
-        if base_indentation > 0:
-            omitted_first_indent = True
-            indents.append(base_indentation)
+        indents = list(self._nodes_tree.get_indents(first_token.start_pos[1]))
+        initial_indentation_count = len(indents)
 
         tokens = self._tokenizer(
             lines,
             start_pos=(1, 0),
-            base_indentation=base_indentation
+            indents=indents
         )
         stack = self._active_parser.stack
         for typ, string, start_pos, prefix in tokens:
             start_pos = start_pos[0] + line_offset, start_pos[1]
-            if typ == PythonTokenTypes.INDENT:
-                indents.append(start_pos[1])
-                if is_first_token and base_indentation >= start_pos[1]:
-                    omitted_first_indent = True
-                    # We want to get rid of indents that are only here because
-                    # we only parse part of the file. These indents would only
-                    # get parsed as error leafs, which doesn't make any sense.
-                    is_first_token = False
-                    continue
-            is_first_token = False
 
-            # In case of omitted_first_indent, it might not be dedented fully.
-            # However this is a sign for us that a dedent happened.
-            if typ == PythonTokenTypes.DEDENT \
-                    or typ == PythonTokenTypes.ERROR_DEDENT \
-                    and omitted_first_indent and len(indents) == 1:
-                indents.pop()
-                if omitted_first_indent and not indents:
+            if typ == PythonTokenTypes.DEDENT:
+                if len(indents) < initial_indentation_count:
                     # We are done here, only thing that can come now is an
                     # endmarker or another dedented code block.
-                    typ, string, start_pos, prefix = next(tokens)
+                    while True:
+                        typ, string, start_pos, prefix = next(tokens)
+                        if typ != PythonTokenTypes.DEDENT:
+                            break
+
                     if '\n' in prefix or '\r' in prefix:
                         prefix = re.sub(r'[^\n\r]+\Z', '', prefix)
                     else:
@@ -453,15 +437,9 @@ class DiffParser(object):
                 was_newline = True
             elif was_newline:
                 was_newline = False
-                if start_pos[1] <= base_indentation:
+                if len(indents) == initial_indentation_count:
                     # Check if the parser is actually in a valid suite state.
                     if _suite_or_file_input_is_valid(self._pgen_grammar, stack):
-                        start_pos = start_pos[0] + 1, 0
-                        if typ == PythonTokenTypes.INDENT:
-                            indents.pop()
-                        while len(indents) > int(omitted_first_indent):
-                            indents.pop()
-                            yield PythonToken(PythonTokenTypes.DEDENT, '', start_pos, '')
 
                         yield PythonToken(PythonTokenTypes.ENDMARKER, '', start_pos, '')
                         break
@@ -562,11 +540,11 @@ class _NodesTree(object):
         self._prefix_remainder = ''
         self.prefix = ''
 
-    def get_base_indentation(self, indentation):
-        for node in reversed(self._working_stack):
+    def get_indents(self, indentation):
+        for node in self._working_stack:
             first_indentation = node.get_first_indentation()
             if indentation >= first_indentation:
-                return first_indentation
+                yield first_indentation
 
     @property
     def parsed_until_line(self):
diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py
index 0f91aaa..05cac39 100644
--- a/parso/python/tokenize.py
+++ b/parso/python/tokenize.py
@@ -386,7 +386,7 @@ def _print_tokens(func):
 
 
 # @_print_tokens
-def tokenize_lines(lines, version_info, start_pos=(1, 0), base_indentation=0):
+def tokenize_lines(lines, version_info, start_pos=(1, 0), indents=None):
     """
     A heavily modified Python standard library tokenizer.
 
@@ -400,16 +400,15 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0), base_indentation=0):
                 yield PythonToken(ERROR_DEDENT, '', (lnum, start), '')
                 indents[-1] = start
                 break
-            yield PythonToken(DEDENT, '', spos, '')
             indents.pop()
+            yield PythonToken(DEDENT, '', spos, '')
 
     pseudo_token, single_quoted, triple_quoted, endpats, whitespace, \
         fstring_pattern_map, always_break_tokens, = \
         _get_token_collection(version_info)
     paren_level = 0  # count parentheses
-    indents = [0]
-    if base_indentation:
-        indents.append(base_indentation)
+    if indents is None:
+        indents = [0]
     max = 0
     numchars = '0123456789'
     contstr = ''
@@ -670,6 +669,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0), base_indentation=0):
     # As the last position we just take the maximally possible position. We
     # remove -1 for the last new line.
     for indent in indents[1:]:
+        indents.pop()
         yield PythonToken(DEDENT, '', end_pos, '')
     yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)
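The change above boils down to one shared-state contract: the diff parser now
owns the indents list, hands it to the tokenizer via the new indents=
parameter, and the tokenizer mutates it in place, popping before each DEDENT
is yielded (hence the reordered pop in tokenize.py). The caller can then tell
that parsing has dedented out of its starting block simply by comparing
len(indents) against the count it started with, instead of tracking the old
omitted_first_indent bookkeeping on the side. The standalone sketch below
illustrates that contract; toy_tokenize and the sample input are hypothetical
stand-ins, not parso's actual API.

    # Toy illustration only -- toy_tokenize is a made-up stand-in for
    # parso's tokenize_lines(); nothing here is part of the patch itself.
    def toy_tokenize(lines, indents):
        """Yield (type, value) pairs, mutating the caller's indents list."""
        for line in lines:
            stripped = line.lstrip(' ')
            column = len(line) - len(stripped)
            if column > indents[-1]:
                indents.append(column)
            while column < indents[-1]:
                # Pop *before* yielding, so the caller sees the shrunken
                # list the moment the DEDENT token arrives.
                indents.pop()
                yield 'DEDENT', ''
            yield 'LINE', stripped

    # As if _NodesTree.get_indents() had reported one open suite at column 4:
    indents = [0, 4]
    initial_indentation_count = len(indents)
    lines = [
        '    y = 1\n',  # still inside the suite being re-parsed
        'z = 2\n',      # column 0 dedents past the initial block
    ]
    for typ, value in toy_tokenize(lines, indents):
        if typ == 'DEDENT' and len(indents) < initial_indentation_count:
            print('dedented out of the initial block; emit ENDMARKER')
            break

Running this prints the message on the first DEDENT, mirroring how the
patched _diff_tokenize breaks out of its loop and emits its own ENDMARKER as
soon as the token stream dedents past the block it started in.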