
on the way to a better fast_parser - improved a lot of the positioning stuff

David Halter
2013-03-24 22:51:17 +04:30
parent df058b93c2
commit a99d9541bd
3 changed files with 101 additions and 40 deletions

View File

@@ -6,6 +6,8 @@ from _compatibility import next
 import debug
 import settings
 
+FLOWS = ['if', 'else', 'elif', 'while', 'with', 'try', 'except', 'finally']
+
 class MultiLevelStopIteration(Exception):
     """
@@ -56,14 +58,21 @@ class PushBackIterator(object):
 class NoErrorTokenizer(object):
-    def __init__(self, readline, offset=(0, 0), stop_on_scope=False):
+    def __init__(self, readline, offset=(0, 0), is_fast_parser=False):
         self.readline = readline
         self.gen = PushBackIterator(tokenize.generate_tokens(readline))
         self.offset = offset
-        self.stop_on_scope = stop_on_scope
-        self.first_scope = False
         self.closed = False
-        self.first = True
+        self.is_first = True
+
+        # fast parser options
+        self.is_fast_parser = is_fast_parser
+        self.current = self.previous = [None, None, (0, 0), (0, 0), '']
+        self.in_flow = False
+        self.new_indent = False
+        self.parser_indent = 0
+        self.is_decorator = False
+        self.first_stmt = True
 
     def push_last_back(self):
         self.gen.push_back(self.current)
@@ -76,6 +85,8 @@ class NoErrorTokenizer(object):
         if self.closed:
             raise MultiLevelStopIteration()
         try:
+            last_previous = self.previous
+            self.previous = self.current
             self.current = next(self.gen)
         except tokenize.TokenError:
             # We just ignore this error, I try to handle it earlier - as
@@ -99,22 +110,60 @@ class NoErrorTokenizer(object):
         c = list(self.current)
 
-        # stop if a new class or definition is started at position zero.
-        breaks = ['def', 'class', '@']
-        if self.stop_on_scope and c[1] in breaks and c[2][1] == 0:
-            if self.first_scope:
-                self.closed = True
-                raise MultiLevelStopIteration()
-            elif c[1] != '@':
-                self.first_scope = True
+        if c[0] == tokenize.ENDMARKER:
+            self.current = self.previous
+            self.previous = last_previous
+            raise MultiLevelStopIteration()
 
-        if self.first:
+        # this is exactly the same check as in fast_parser, but this time with
+        # tokenize and therefore precise.
+        breaks = ['def', 'class', '@']
+        if self.is_first:
             c[2] = self.offset[0] + c[2][0], self.offset[1] + c[2][1]
             c[3] = self.offset[0] + c[3][0], self.offset[1] + c[3][1]
-            self.first = False
+            self.is_first = False
         else:
             c[2] = self.offset[0] + c[2][0], c[2][1]
             c[3] = self.offset[0] + c[3][0], c[3][1]
+        print 'h', c, tokenize.tok_name[c[0]], self.current[2:4]
+        self.current = c
+
+        def close():
+            if not self.first_stmt:
+                self.closed = True
+                raise MultiLevelStopIteration()
+
+        # ignore indents/comments
+        if self.is_fast_parser \
+                and self.previous[0] in (tokenize.INDENT, tokenize.NL, None,
+                                         tokenize.NEWLINE, tokenize.DEDENT) \
+                and c[0] not in (tokenize.COMMENT, tokenize.INDENT,
+                                 tokenize.NL, tokenize.NEWLINE, tokenize.DEDENT):
+            print c, tokenize.tok_name[c[0]]
+            tok = c[1]
+            indent = c[2][1]
+            if indent < self.parser_indent: # -> dedent
+                self.parser_indent = indent
+                self.new_indent = False
+                if not self.in_flow:
+                    close()
+                self.in_flow = False
+            elif self.new_indent:
+                self.parser_indent = indent
+            if not self.in_flow:
+                if tok in FLOWS or tok in breaks:
+                    self.in_flow = tok in FLOWS
+                    if not self.is_decorator and not self.in_flow:
+                        close()
+                    self.is_decorator = '@' == tok
+                    if not self.is_decorator:
+                        self.parser_indent += 1 # new scope: must be higher
+                        self.new_indent = True
+            if tok != '@':
+                self.first_stmt = False
+
         return c
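Note on the hunk above: the offset arithmetic is the core of the positioning work. Tokens coming from a re-tokenized chunk are shifted to their position in the whole file, and only the very first token also receives the column offset, since every later token already carries the correct column for its own line. A minimal standalone sketch of that idea (Python 3 here, although the diff itself is Python 2; offset_tokens is a name invented for this sketch, not jedi API):

import io
import tokenize

def offset_tokens(code, offset=(0, 0)):
    # Mirrors the is_first branch above: the first token gets both the row
    # and the column offset, all following tokens only the row offset.
    gen = tokenize.generate_tokens(io.StringIO(code).readline)
    is_first = True
    for typ, string, start, end, line in gen:
        if is_first:
            start = offset[0] + start[0], offset[1] + start[1]
            end = offset[0] + end[0], offset[1] + end[1]
            is_first = False
        else:
            start = offset[0] + start[0], start[1]
            end = offset[0] + end[0], end[1]
        yield typ, string, start, end, line

# Pretend this chunk really starts at line 42 of the original module.
for tok in offset_tokens("def f():\n    pass\n", offset=(41, 0)):
    print(tok[2], tok[3], tokenize.tok_name[tok[0]], repr(tok[1]))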

View File

@@ -11,6 +11,7 @@ import settings
 import parsing
 import parsing_representation as pr
 import cache
+import common
 
 class Module(pr.Simple, pr.Module):
@@ -219,17 +220,15 @@ class FastParser(use_metaclass(CachedFastParser)):
             parts.append(txt)
             current_lines[:] = []
 
-        flows = ['if', 'else', 'elif', 'while', 'with', 'try', 'except',
-                 'finally']
-        r_keyword = '^[ \t]*(def|class|@|%s)' % '|'.join(flows)
+        r_keyword = '^[ \t]*(def|class|@|%s)' % '|'.join(common.FLOWS)
 
         lines = code.splitlines()
         current_lines = []
         parts = []
-        is_generator = False
+        is_decorator = False
         current_indent = 0
         new_indent = False
-        is_flow = False
+        in_flow = False
         # All things within flows are simply being ignored.
         for i, l in enumerate(lines):
             # check for dedents
@@ -242,29 +241,35 @@ class FastParser(use_metaclass(CachedFastParser)):
             if indent < current_indent: # -> dedent
                 current_indent = indent
                 new_indent = False
-                if not is_flow:
+                if not in_flow:
                     add_part()
-                is_flow = False
+                in_flow = False
             elif new_indent:
                 current_indent = indent
                 new_indent = False
 
             # Check lines for functions/classes and split the code there.
-            if not is_flow:
+            if not in_flow:
                 m = re.match(r_keyword, l)
                 if m:
-                    is_flow = m.group(1) in flows
-                    if not is_generator and not is_flow:
+                    in_flow = m.group(1) in common.FLOWS
+                    if not is_decorator and not in_flow:
                         add_part()
                         current_lines = []
-                    is_generator = '@' == m.group(1)
-                    if not is_generator:
+                    is_decorator = '@' == m.group(1)
+                    if not is_decorator:
                         current_indent += 1 # it must be higher
                         new_indent = True
 
             current_lines.append(l)
         add_part()
+
+        for p in parts:
+            #print '#####################################'
+            #print p
+            #print len(p.splitlines())
+            pass
+
         return parts
 
     def _parse(self, code):
@@ -280,11 +285,12 @@ class FastParser(use_metaclass(CachedFastParser)):
             el = module.imports[0]
             return el.start_pos[1]
 
-        if self.parsers:
+        if self.parsers and False:
             new_indent = get_indent(module)
             old_indent = get_indent(self.parsers[-1].module)
             if old_indent < new_indent:
-                module.parent = self.parsers[-1].module.subscopes[0]
+                #module.parent = self.parsers[-1].module.subscopes[0]
+                # TODO set parents + add to subscopes
                 return
 
         p.module.parent = self.module
@@ -301,7 +307,7 @@ class FastParser(use_metaclass(CachedFastParser)):
         p = None
         parser_order = 0
         for code_part in parts:
-            lines = code_part.count('\n')
+            lines = code_part.count('\n') + 1
             # the parser is using additional newlines, therefore substract
             if p is None or line_offset >= p.end_pos[0] - 2:
                 # check if code_part has already been parsed
@@ -336,8 +342,13 @@ class FastParser(use_metaclass(CachedFastParser)):
             parser_order += 1
             line_offset += lines
-            start += len(code_part)
+            print line_offset
+            start += len(code_part) + 1 # +1 for newline
 
         self.parsers[parser_order + 1:] = []
+        for p in self.parsers:
+            print(p.module.get_code())
+            print(p.module.start_pos, p.module.end_pos)
+        exit()
 
     def reset_caches(self):
         self._user_scope = None
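For reference, the splitting strategy that _split_parts implements above (scan lines for def/class/@ while treating flow keywords as "do not split here") can be sketched in isolation. This is a deliberately simplified illustration that only splits at the top level and ignores decorator and indent handling; split_code and the sample input are inventions of this sketch, not jedi code:

import re

FLOWS = ['if', 'else', 'elif', 'while', 'with', 'try', 'except', 'finally']
r_keyword = re.compile('^[ \t]*(def|class|@|%s)' % '|'.join(FLOWS))

def split_code(code):
    # Start a new part at every top-level def/class; flow keywords match
    # the regex too but must never open a new part.
    parts, current = [], []
    for line in code.splitlines(True):
        m = r_keyword.match(line)
        if m and m.group(1) not in FLOWS and not line[0].isspace():
            if current:
                parts.append(''.join(current))
                current = []
        current.append(line)
    if current:
        parts.append(''.join(current))
    return parts

code = "import os\n\ndef f():\n    pass\n\nclass C:\n    pass\n"
for part in split_code(code):
    print(repr(part))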

View File

@@ -52,13 +52,11 @@ class Parser(object):
         self.user_stmt = None
         self.no_docstr = no_docstr
 
+        self.start_pos = self.end_pos = 1 + offset[0], offset[1]
         # initialize global Scope
-        self.module = pr.SubModule(module_path, (offset[0] + 1, offset[1]),
-                                   top_module)
+        self.module = pr.SubModule(module_path, self.start_pos, top_module)
         self.scope = self.module
         self.current = (None, None)
-        self.start_pos = 1, 0
-        self.end_pos = 1, 0
 
         source = source + '\n' # end with \n, because the parser needs it
         buf = StringIO(source)
@@ -79,6 +77,10 @@ class Parser(object):
             # because of `self.module.used_names`.
             d.parent = self.module
 
+        if self.current[0] in (tokenize.NL, tokenize.NEWLINE):
+            # we added a newline before, so we need to "remove" it again.
+            self.end_pos = self._gen.previous[2]
+
         self.start_pos = self.module.start_pos
         self.module.end_pos = self.end_pos
         del self._gen
@@ -170,8 +172,6 @@ class Parser(object):
         while True:
             defunct = False
             token_type, tok = self.next()
-            if token_type == tokenize.ENDMARKER:
-                break
             if brackets and tok == '\n':
                 self.next()
             if tok == '(': # python allows only one `(` in the statement.
@@ -421,8 +421,10 @@ class Parser(object):
     def __next__(self):
         """ Generate the next tokenize pattern. """
         try:
-            typ, tok, self.start_pos, self.end_pos, \
-                self.parserline = next(self._gen)
+            typ, tok, start_pos, end_pos, self.parserline = next(self._gen)
+            # dedents shouldn't change positions
+            if typ != tokenize.DEDENT:
+                self.start_pos, self.end_pos = start_pos, end_pos
         except (StopIteration, common.MultiLevelStopIteration):
             # on finish, set end_pos correctly
             s = self.scope
@@ -662,7 +664,6 @@ class Parser(object):
                 self.freshscope = False
             else:
                 if token_type not in [tokenize.COMMENT, tokenize.INDENT,
-                                      tokenize.NEWLINE, tokenize.NL,
-                                      tokenize.ENDMARKER]:
+                                      tokenize.NEWLINE, tokenize.NL]:
                     debug.warning('token not classified', tok, token_type,
                                   self.start_pos[0])
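One more note, on the FastParser hunks at @@ -301,7 and @@ -336,8: both + 1 corrections exist because code.splitlines() drops the newline at the end of every part, so the running line offset and the character offset each come up one short per part once the parts are treated as newline-terminated again. A toy check of that bookkeeping (assuming, as a simplification, that parts are re-joined with single newlines):

# Two parts as _split_parts would produce them, without trailing newlines.
parts = ["def f():\n    pass", "def g():\n    return 1"]

line_offset, start = 0, 0
for code_part in parts:
    lines = code_part.count('\n') + 1   # line count including the last line
    print('part begins at line', line_offset + 1, 'char offset', start)
    line_offset += lines
    start += len(code_part) + 1         # +1 for the newline lost by splitlines()

full = '\n'.join(parts) + '\n'
assert start == len(full)
assert line_offset == full.count('\n')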