forked from VimPlug/jedi

The new tokenizer is more or less working now. Indents are calculated as they should be.

Dave Halter
2014-11-27 16:03:58 +01:00
parent c0df7003a5
commit 97516eb26b
3 changed files with 47 additions and 6 deletions

View File

@@ -131,7 +131,7 @@ class Parser(object):
p = pgen2.parse.Parser(grammar, self.convert_node, self.convert_leaf,
self.error_recovery)
tokenizer = tokenizer or tokenize.source_tokens(source)
self.module = p.parse(self._tokenize(tokenizer))
self.module = p.parse(p.tokenize(self._tokenize(tokenizer)))
self.module.used_names = self.used_names
self.module.path = module_path
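The one-line change above layers the new indent pass on top of the existing one: the pgen2 parser now consumes p.tokenize(self._tokenize(tokenizer)) instead of the _tokenize output directly. A minimal sketch of that kind of generator chaining, with hypothetical stage names (strip_scopes and add_indents are stand-ins for illustration, not jedi functions):

def strip_scopes(tokens):
    # Stand-in for Parser._tokenize: per-token bookkeeping, then pass the token through.
    for tok in tokens:
        yield tok

def add_indents(tokens):
    # Stand-in for pgen2.parse.Parser.tokenize: may inject extra tokens into the stream.
    for tok in tokens:
        yield tok

# Same chaining shape as in the diff; every stage sees (type, value, prefix, start_pos) tuples.
raw = iter([('NAME', 'x', '', (1, 0))])
print(list(add_indents(strip_scopes(raw))))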
@@ -247,6 +247,18 @@ class Parser(object):
stack[start_index:] = []
def _tokenize(self, tokenizer):
"""
while first_pos[1] <= self._scope.start_pos[1] \
and (token_type == tokenize.NAME or tok_str in ('(', '['))\
and self._scope != self.module:
self._scope.end_pos = first_pos
self._scope = self._scope.parent
if isinstance(self._scope, pr.Module) \
and not isinstance(self._scope, pr.SubModule):
self._scope = self.module
"""
new_scope = False
for token in tokenizer:
typ = token.type
value = token.value

View File

@@ -96,9 +96,32 @@ class Parser(object):
self.stack = [stackentry]
self.rootnode = None
self.error_recovery = error_recovery
indent_errors = [] # TODO generate those.
def parse(self, tokens):
for type, value, prefix, start_pos in tokens:
def tokenize(self, tokenizer):
"""
This is not a real tokenizer, but it adds indents. You could hand the
parse function a normal tokenizer (e.g. the lib2to3 one). But if we use
the parser stack we are able to do error recovery from wrong indents.
"""
indents = [0]
new_line = False
for type, value, prefix, start_pos in tokenizer:
if type == token.NEWLINE:
new_line = True
elif new_line:
indent = start_pos[1]
if indent > indents[-1]:
yield token.INDENT, '', '', start_pos
indents.append(indent)
while indent < indents[-1]:
yield token.DEDENT, '', '', start_pos
indents.pop()
new_line = False
yield type, value, prefix, start_pos
def parse(self, tokenizer):
for type, value, prefix, start_pos in tokenizer:
if self.addtoken(type, value, prefix, start_pos):
break
else:

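The docstring above describes the approach: instead of making the raw tokenizer compute indentation, tokenize() wraps the incoming stream and inserts INDENT/DEDENT tokens based on the column of the first token after each NEWLINE, which keeps the parser stack available for recovering from wrong indents. A self-contained sketch of that generator, using stand-in constants rather than jedi's real token module:

NEWLINE, INDENT, DEDENT, NAME = range(4)   # stand-ins for the token module constants

def add_indents(tokens):
    # Yield the original tokens, plus INDENT/DEDENT derived from column positions.
    indents = [0]
    new_line = False
    for type, value, prefix, start_pos in tokens:
        if type == NEWLINE:
            new_line = True
        elif new_line:
            indent = start_pos[1]           # column of the first token on the new line
            if indent > indents[-1]:        # deeper than the enclosing block: one INDENT
                yield INDENT, '', '', start_pos
                indents.append(indent)
            while indent < indents[-1]:     # shallower: one DEDENT per closed level
                yield DEDENT, '', '', start_pos
                indents.pop()
            new_line = False
        yield type, value, prefix, start_pos

# Roughly "if x:" / "    y" / "z": INDENT is emitted before y, DEDENT before z.
stream = [
    (NAME, 'if', '', (1, 0)), (NAME, 'x', ' ', (1, 3)), (NEWLINE, '\n', '', (1, 5)),
    (NAME, 'y', '    ', (2, 4)), (NEWLINE, '\n', '', (2, 5)),
    (NAME, 'z', '', (3, 0)),
]
print([typ for typ, _, _, _ in add_indents(stream)])

As in CPython's tokenizer, at most one INDENT is produced per line, while DEDENTs are emitted until the indent stack matches the new column.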
View File

@@ -66,7 +66,7 @@ class Token(object):
self.value = value
self._start_pos_line = start_pos[0]
self._start_pos_col = start_pos[1]
self.prefix = whitespace
self.prefix = prefix
def __repr__(self):
typ = tok_name[self.type]
@@ -228,6 +228,7 @@ def generate_tokens(readline, line_offset=0):
numchars = '0123456789'
contstr = ''
contline = None
new_line = False
prefix = '' # Should never be required, but here for safety
while True: # loop over lines in stream
line = readline() # readline returns empty when finished. See StringIO
@@ -269,14 +270,19 @@ def generate_tokens(readline, line_offset=0):
spos = (lnum, start)
token, initial = line[start:pos], line[start]
if new_line and initial not in '\r\n#':
new_line = False
if (initial in numchars or # ordinary number
(initial == '.' and token != '.' and token != '...')):
yield Token(NUMBER, token, spos, prefix)
elif initial in '\r\n':
yield Token(NEWLINE, token, spos, prefix)
if not new_line:
yield Token(NEWLINE, token, spos, prefix)
new_line = True
elif initial == '#':
assert not token.endswith("\n")
yield Token(COMMENT, token, spos, prefix)
#yield Token(COMMENT, token, spos, prefix)
elif token in triple_quoted:
endprog = endprogs[token]
endmatch = endprog.match(line, pos)
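The new_line flag introduced in this hunk makes the raw tokenizer yield NEWLINE only for the first line break of a run, so blank lines no longer produce extra NEWLINE tokens; the commented-out yield also means COMMENT tokens are no longer emitted. A toy reduction of that behaviour, using plain strings instead of Token objects (collapse_newlines is a made-up name, not part of jedi):

def collapse_newlines(tokens):
    # Only the first line break of a run is yielded; comments are dropped and,
    # as in the diff's check (initial not in '\r\n#'), they do not reset the flag.
    new_line = False
    for tok in tokens:
        if tok == '\n':
            if not new_line:
                yield tok
            new_line = True
        elif not tok.startswith('#'):
            new_line = False
            yield tok

print(list(collapse_newlines(['a', '\n', '\n', '# comment', '\n', 'b'])))
# -> ['a', '\n', 'b']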