mirror of
https://github.com/davidhalter/parso.git
synced 2025-12-23 12:41:43 +08:00
Move the tokenizer/diff parser closer together
This commit is contained in:
@@ -397,46 +397,30 @@ class DiffParser(object):
|
|||||||
return self._active_parser.parse(tokens=tokens)
|
return self._active_parser.parse(tokens=tokens)
|
||||||
|
|
||||||
def _diff_tokenize(self, lines, until_line, line_offset=0):
|
def _diff_tokenize(self, lines, until_line, line_offset=0):
|
||||||
is_first_token = True
|
|
||||||
omitted_first_indent = False
|
|
||||||
was_newline = False
|
was_newline = False
|
||||||
indents = []
|
|
||||||
|
|
||||||
first_token = next(self._tokenizer(lines))
|
first_token = next(self._tokenizer(lines))
|
||||||
base_indentation = self._nodes_tree.get_base_indentation(first_token.start_pos[1])
|
indents = list(self._nodes_tree.get_indents(first_token.start_pos[1]))
|
||||||
if base_indentation > 0:
|
initial_indentation_count = len(indents)
|
||||||
omitted_first_indent = True
|
|
||||||
indents.append(base_indentation)
|
|
||||||
|
|
||||||
tokens = self._tokenizer(
|
tokens = self._tokenizer(
|
||||||
lines,
|
lines,
|
||||||
start_pos=(1, 0),
|
start_pos=(1, 0),
|
||||||
base_indentation=base_indentation
|
indents=indents
|
||||||
)
|
)
|
||||||
stack = self._active_parser.stack
|
stack = self._active_parser.stack
|
||||||
for typ, string, start_pos, prefix in tokens:
|
for typ, string, start_pos, prefix in tokens:
|
||||||
start_pos = start_pos[0] + line_offset, start_pos[1]
|
start_pos = start_pos[0] + line_offset, start_pos[1]
|
||||||
if typ == PythonTokenTypes.INDENT:
|
|
||||||
indents.append(start_pos[1])
|
|
||||||
if is_first_token and base_indentation >= start_pos[1]:
|
|
||||||
omitted_first_indent = True
|
|
||||||
# We want to get rid of indents that are only here because
|
|
||||||
# we only parse part of the file. These indents would only
|
|
||||||
# get parsed as error leafs, which doesn't make any sense.
|
|
||||||
is_first_token = False
|
|
||||||
continue
|
|
||||||
is_first_token = False
|
|
||||||
|
|
||||||
# In case of omitted_first_indent, it might not be dedented fully.
|
if typ == PythonTokenTypes.DEDENT:
|
||||||
# However this is a sign for us that a dedent happened.
|
if len(indents) < initial_indentation_count:
|
||||||
if typ == PythonTokenTypes.DEDENT \
|
|
||||||
or typ == PythonTokenTypes.ERROR_DEDENT \
|
|
||||||
and omitted_first_indent and len(indents) == 1:
|
|
||||||
indents.pop()
|
|
||||||
if omitted_first_indent and not indents:
|
|
||||||
# We are done here, only thing that can come now is an
|
# We are done here, only thing that can come now is an
|
||||||
# endmarker or another dedented code block.
|
# endmarker or another dedented code block.
|
||||||
|
while True:
|
||||||
typ, string, start_pos, prefix = next(tokens)
|
typ, string, start_pos, prefix = next(tokens)
|
||||||
|
if typ != PythonTokenTypes.DEDENT:
|
||||||
|
break
|
||||||
|
|
||||||
if '\n' in prefix or '\r' in prefix:
|
if '\n' in prefix or '\r' in prefix:
|
||||||
prefix = re.sub(r'[^\n\r]+\Z', '', prefix)
|
prefix = re.sub(r'[^\n\r]+\Z', '', prefix)
|
||||||
else:
|
else:
|
||||||
@@ -453,15 +437,9 @@ class DiffParser(object):
|
|||||||
was_newline = True
|
was_newline = True
|
||||||
elif was_newline:
|
elif was_newline:
|
||||||
was_newline = False
|
was_newline = False
|
||||||
if start_pos[1] <= base_indentation:
|
if len(indents) == initial_indentation_count:
|
||||||
# Check if the parser is actually in a valid suite state.
|
# Check if the parser is actually in a valid suite state.
|
||||||
if _suite_or_file_input_is_valid(self._pgen_grammar, stack):
|
if _suite_or_file_input_is_valid(self._pgen_grammar, stack):
|
||||||
start_pos = start_pos[0] + 1, 0
|
|
||||||
if typ == PythonTokenTypes.INDENT:
|
|
||||||
indents.pop()
|
|
||||||
while len(indents) > int(omitted_first_indent):
|
|
||||||
indents.pop()
|
|
||||||
yield PythonToken(PythonTokenTypes.DEDENT, '', start_pos, '')
|
|
||||||
yield PythonToken(PythonTokenTypes.ENDMARKER, '', start_pos, '')
|
yield PythonToken(PythonTokenTypes.ENDMARKER, '', start_pos, '')
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -562,11 +540,11 @@ class _NodesTree(object):
|
|||||||
self._prefix_remainder = ''
|
self._prefix_remainder = ''
|
||||||
self.prefix = ''
|
self.prefix = ''
|
||||||
|
|
||||||
def get_base_indentation(self, indentation):
|
def get_indents(self, indentation):
|
||||||
for node in reversed(self._working_stack):
|
for node in self._working_stack:
|
||||||
first_indentation = node.get_first_indentation()
|
first_indentation = node.get_first_indentation()
|
||||||
if indentation >= first_indentation:
|
if indentation >= first_indentation:
|
||||||
return first_indentation
|
yield first_indentation
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def parsed_until_line(self):
|
def parsed_until_line(self):
|
||||||
|
|||||||
@@ -386,7 +386,7 @@ def _print_tokens(func):
|
|||||||
|
|
||||||
|
|
||||||
# @_print_tokens
|
# @_print_tokens
|
||||||
def tokenize_lines(lines, version_info, start_pos=(1, 0), base_indentation=0):
|
def tokenize_lines(lines, version_info, start_pos=(1, 0), indents=None):
|
||||||
"""
|
"""
|
||||||
A heavily modified Python standard library tokenizer.
|
A heavily modified Python standard library tokenizer.
|
||||||
|
|
||||||
@@ -400,16 +400,15 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0), base_indentation=0):
|
|||||||
yield PythonToken(ERROR_DEDENT, '', (lnum, start), '')
|
yield PythonToken(ERROR_DEDENT, '', (lnum, start), '')
|
||||||
indents[-1] = start
|
indents[-1] = start
|
||||||
break
|
break
|
||||||
yield PythonToken(DEDENT, '', spos, '')
|
|
||||||
indents.pop()
|
indents.pop()
|
||||||
|
yield PythonToken(DEDENT, '', spos, '')
|
||||||
|
|
||||||
pseudo_token, single_quoted, triple_quoted, endpats, whitespace, \
|
pseudo_token, single_quoted, triple_quoted, endpats, whitespace, \
|
||||||
fstring_pattern_map, always_break_tokens, = \
|
fstring_pattern_map, always_break_tokens, = \
|
||||||
_get_token_collection(version_info)
|
_get_token_collection(version_info)
|
||||||
paren_level = 0 # count parentheses
|
paren_level = 0 # count parentheses
|
||||||
|
if indents is None:
|
||||||
indents = [0]
|
indents = [0]
|
||||||
if base_indentation:
|
|
||||||
indents.append(base_indentation)
|
|
||||||
max = 0
|
max = 0
|
||||||
numchars = '0123456789'
|
numchars = '0123456789'
|
||||||
contstr = ''
|
contstr = ''
|
||||||
@@ -670,6 +669,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0), base_indentation=0):
|
|||||||
# As the last position we just take the maximally possible position. We
|
# As the last position we just take the maximally possible position. We
|
||||||
# remove -1 for the last new line.
|
# remove -1 for the last new line.
|
||||||
for indent in indents[1:]:
|
for indent in indents[1:]:
|
||||||
|
indents.pop()
|
||||||
yield PythonToken(DEDENT, '', end_pos, '')
|
yield PythonToken(DEDENT, '', end_pos, '')
|
||||||
yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)
|
yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user