start using tokens all the way through in the parser

This commit is contained in:
Dave Halter
2014-02-24 20:35:36 +01:00
parent 6058e8b9c3
commit 50f8b8bf0c
+104 -100
View File
@@ -97,8 +97,7 @@ class Parser(object):
The dot name parser parses a name, variable or function and returns The dot name parser parses a name, variable or function and returns
their names. their names.
:return: Tuple of Name, token_type, nexttoken. :return: tuple of Name, next_token
:rtype: tuple(Name, int, str)
""" """
def append(el): def append(el):
names.append(el) names.append(el)
@@ -106,30 +105,30 @@ class Parser(object):
names = [] names = []
if pre_used_token is None: if pre_used_token is None:
token_type, tok = self.next() tok = self.next()
if token_type != tokenize.NAME and tok != '*': if tok.type != tokenize.NAME and tok.string != '*':
return [], token_type, tok return [], tok # TODO the fuck, why []?
else: else:
token_type, tok = pre_used_token tok = pre_used_token
if token_type != tokenize.NAME and tok != '*': if tok.type != tokenize.NAME and tok.string != '*':
# token maybe a name or star # token maybe a name or star
return None, token_type, tok return None, tok
append((tok, self.start_pos)) append((tok.string, self.start_pos))
first_pos = self.start_pos first_pos = self.start_pos
while True: while True:
end_pos = self.end_pos end_pos = self.end_pos
token_type, tok = self.next() tok = self.next()
if tok != '.': if tok.string != '.':
break break
token_type, tok = self.next() tok = self.next()
if token_type != tokenize.NAME: if tok.type != tokenize.NAME:
break break
append((tok, self.start_pos)) append((tok.string, self.start_pos))
n = pr.Name(self.module, names, first_pos, end_pos) if names else None n = pr.Name(self.module, names, first_pos, end_pos) if names else None
return n, token_type, tok return n, tok
def _parse_import_list(self): def _parse_import_list(self):
""" """
@@ -153,22 +152,22 @@ class Parser(object):
+ list(set(keyword.kwlist) - set(['as'])) + list(set(keyword.kwlist) - set(['as']))
while True: while True:
defunct = False defunct = False
token_type, tok = self.next() tok = self.next()
if tok == '(': # python allows only one `(` in the statement. if tok.string == '(': # python allows only one `(` in the statement.
brackets = True brackets = True
token_type, tok = self.next() tok = self.next()
if brackets and tok == '\n': if brackets and tok.string == '\n':
token_type, tok = self.next() tok = self.next()
i, token_type, tok = self._parse_dot_name(self._current) i, tok = self._parse_dot_name(tok)
if not i: if not i:
defunct = True defunct = True
name2 = None name2 = None
if tok == 'as': if tok.string == 'as':
name2, token_type, tok = self._parse_dot_name() name2, tok = self._parse_dot_name()
imports.append((i, name2, defunct)) imports.append((i, name2, defunct))
while tok not in continue_kw: while tok.string not in continue_kw:
token_type, tok = self.next() tok = self.next()
if not (tok == "," or brackets and tok == '\n'): if not (tok.string == "," or brackets and tok.string == '\n'):
break break
return imports return imports
@@ -184,10 +183,10 @@ class Parser(object):
tok = None tok = None
pos = 0 pos = 0
breaks = [',', ':'] breaks = [',', ':']
while tok not in [')', ':']: while tok is None or tok.string not in [')', ':']:
param, tok = self._parse_statement(added_breaks=breaks, param, tok = self._parse_statement(added_breaks=breaks,
stmt_class=pr.Param) stmt_class=pr.Param)
if param and tok == ':': if param and tok.string == ':':
# parse annotations # parse annotations
annotation, tok = self._parse_statement(added_breaks=breaks) annotation, tok = self._parse_statement(added_breaks=breaks)
if annotation: if annotation:
@@ -210,30 +209,30 @@ class Parser(object):
:rtype: Function :rtype: Function
""" """
first_pos = self.start_pos first_pos = self.start_pos
token_type, fname = self.next() tok = self.next()
if token_type != tokenize.NAME: if tok.type != tokenize.NAME:
return None return None
fname = pr.Name(self.module, [(fname, self.start_pos)], self.start_pos, fname = pr.Name(self.module, [(tok.string, self.start_pos)], self.start_pos,
self.end_pos) self.end_pos)
token_type, open = self.next() tok = self.next()
if open != '(': if tok.string != '(':
return None return None
params = self._parse_parentheses() params = self._parse_parentheses()
token_type, colon = self.next() colon = self.next()
annotation = None annotation = None
if colon in ['-', '->']: if colon.string in ['-', '->']:
# parse annotations # parse annotations
if colon == '-': if colon.string == '-':
# The Python 2 tokenizer doesn't understand this # The Python 2 tokenizer doesn't understand this
token_type, colon = self.next() colon = self.next()
if colon != '>': if colon.string != '>':
return None return None
annotation, colon = self._parse_statement(added_breaks=[':']) annotation, colon = self._parse_statement(added_breaks=[':'])
if colon != ':': if colon.string != ':':
return None return None
# because of 2 line func param definitions # because of 2 line func param definitions
@@ -248,22 +247,22 @@ class Parser(object):
:rtype: Class :rtype: Class
""" """
first_pos = self.start_pos first_pos = self.start_pos
token_type, cname = self.next() cname = self.next()
if token_type != tokenize.NAME: if cname.type != tokenize.NAME:
debug.warning("class: syntax err, token is not a name@%s (%s: %s)", debug.warning("class: syntax err, token is not a name@%s (%s: %s)",
self.start_pos[0], tokenize.tok_name[token_type], cname) self.start_pos[0], tokenize.tok_name[cname.type], cname.string)
return None return None
cname = pr.Name(self.module, [(cname, self.start_pos)], self.start_pos, cname = pr.Name(self.module, [(cname.string, self.start_pos)],
self.end_pos) self.start_pos, self.end_pos)
super = [] super = []
token_type, _next = self.next() _next = self.next()
if _next == '(': if _next.string == '(':
super = self._parse_parentheses() super = self._parse_parentheses()
token_type, _next = self.next() _next = self.next()
if _next != ':': if _next.string != ':':
debug.warning("class syntax: %s@%s", cname, self.start_pos[0]) debug.warning("class syntax: %s@%s", cname, self.start_pos[0])
return None return None
@@ -288,14 +287,14 @@ class Parser(object):
level = 0 # The level of parentheses level = 0 # The level of parentheses
if pre_used_token: if pre_used_token:
token_type, tok = pre_used_token tok = pre_used_token
else: else:
token_type, tok = self.next() tok = self.next()
while token_type == tokenize.COMMENT: while tok.type == tokenize.COMMENT:
# remove newline and comment # remove newline and comment
self.next() self.next()
token_type, tok = self.next() tok = self.next()
first_pos = self.start_pos first_pos = self.start_pos
opening_brackets = ['{', '(', '['] opening_brackets = ['{', '(', '[']
@@ -314,9 +313,9 @@ class Parser(object):
tok_list = [] tok_list = []
as_names = [] as_names = []
while not (tok in always_break while not (tok.string in always_break
or tok in not_first_break and not tok_list or tok.string in not_first_break and not tok_list
or tok in breaks and level <= 0): or tok.string in breaks and level <= 0):
try: try:
# print 'parse_stmt', tok, tokenize.tok_name[token_type] # print 'parse_stmt', tok, tokenize.tok_name[token_type]
tok_list.append( tok_list.append(
@@ -324,10 +323,10 @@ class Parser(object):
self._current + (self.start_pos,) self._current + (self.start_pos,)
) )
) )
if tok == 'as': if tok.string == 'as':
token_type, tok = self.next() tok = self.next()
if token_type == tokenize.NAME: if tok.type == tokenize.NAME:
n, token_type, tok = self._parse_dot_name( n, tok = self._parse_dot_name(
self._current self._current
) )
if n: if n:
@@ -335,23 +334,23 @@ class Parser(object):
as_names.append(n) as_names.append(n)
tok_list.append(n) tok_list.append(n)
continue continue
elif tok in ['lambda', 'for', 'in']: elif tok.string in ['lambda', 'for', 'in']:
# don't parse these keywords, parse later in stmt. # don't parse these keywords, parse later in stmt.
if tok == 'lambda': if tok.string == 'lambda':
breaks.discard(':') breaks.discard(':')
elif token_type == tokenize.NAME: elif tok.type == tokenize.NAME:
n, token_type, tok = self._parse_dot_name(self._current) n, tok = self._parse_dot_name(self._current)
# removed last entry, because we add Name # removed last entry, because we add Name
tok_list.pop() tok_list.pop()
if n: if n:
tok_list.append(n) tok_list.append(n)
continue continue
elif tok in opening_brackets: elif tok.string in opening_brackets:
level += 1 level += 1
elif tok in closing_brackets: elif tok.string in closing_brackets:
level -= 1 level -= 1
token_type, tok = self.next() tok = self.next()
except (StopIteration, common.MultiLevelStopIteration): except (StopIteration, common.MultiLevelStopIteration):
# comes from tokenizer # comes from tokenizer
break break
@@ -387,7 +386,7 @@ class Parser(object):
stmt.parent = self._top_module stmt.parent = self._top_module
self._check_user_stmt(stmt) self._check_user_stmt(stmt)
if tok in always_break + not_first_break: if tok.string in always_break + not_first_break:
self._gen.push_last_back() self._gen.push_last_back()
return stmt, tok return stmt, tok
@@ -399,11 +398,13 @@ class Parser(object):
def __next__(self): def __next__(self):
""" Generate the next tokenize pattern. """ """ Generate the next tokenize pattern. """
typ, tok, start_pos, end_pos = next(self._gen) #typ, tok, start_pos, end_pos = next(self._gen)
self._current = next(self._gen)
# dedents shouldn't change positions # dedents shouldn't change positions
self.start_pos = start_pos self.start_pos = self._current.start
self.end_pos = self._current.end
self._current = typ, tok #self._current = typ, tok
return self._current return self._current
def _parse(self): def _parse(self):
@@ -422,9 +423,11 @@ class Parser(object):
self._decorators = [] self._decorators = []
self.freshscope = True self.freshscope = True
self.iterator = iter(self)
# This iterator stuff is not intentional. It grew historically. # This iterator stuff is not intentional. It grew historically.
for token_type, tok in self.iterator: self.iterator = iter(self)
for tok in self.iterator:
token_type = tok.type
tok_str = tok.string
self.module.temp_used_names = [] self.module.temp_used_names = []
# debug.dbg('main: tok=[%s] type=[%s] indent=[%s]', \ # debug.dbg('main: tok=[%s] type=[%s] indent=[%s]', \
# tok, tokenize.tok_name[token_type], start_position[0]) # tok, tokenize.tok_name[token_type], start_position[0])
@@ -433,7 +436,7 @@ class Parser(object):
# errors. only check for names, because thats relevant here. If # errors. only check for names, because thats relevant here. If
# some docstrings are not indented, I don't care. # some docstrings are not indented, I don't care.
while self.start_pos[1] <= self._scope.start_pos[1] \ while self.start_pos[1] <= self._scope.start_pos[1] \
and (token_type == tokenize.NAME or tok in ['(', '['])\ and (token_type == tokenize.NAME or tok_str in ['(', '['])\
and self._scope != self.module: and self._scope != self.module:
self._scope.end_pos = self.start_pos self._scope.end_pos = self.start_pos
self._scope = self._scope.parent self._scope = self._scope.parent
@@ -446,7 +449,7 @@ class Parser(object):
else: else:
use_as_parent_scope = self._scope use_as_parent_scope = self._scope
first_pos = self.start_pos first_pos = self.start_pos
if tok == 'def': if tok_str == 'def':
func = self._parse_function() func = self._parse_function()
if func is None: if func is None:
debug.warning("function: syntax error@%s", self.start_pos[0]) debug.warning("function: syntax error@%s", self.start_pos[0])
@@ -454,7 +457,7 @@ class Parser(object):
self.freshscope = True self.freshscope = True
self._scope = self._scope.add_scope(func, self._decorators) self._scope = self._scope.add_scope(func, self._decorators)
self._decorators = [] self._decorators = []
elif tok == 'class': elif tok_str == 'class':
cls = self._parse_class() cls = self._parse_class()
if cls is None: if cls is None:
debug.warning("class: syntax error@%s" % self.start_pos[0]) debug.warning("class: syntax error@%s" % self.start_pos[0])
@@ -463,7 +466,7 @@ class Parser(object):
self._scope = self._scope.add_scope(cls, self._decorators) self._scope = self._scope.add_scope(cls, self._decorators)
self._decorators = [] self._decorators = []
# import stuff # import stuff
elif tok == 'import': elif tok_str == 'import':
imports = self._parse_import_list() imports = self._parse_import_list()
for count, (m, alias, defunct) in enumerate(imports): for count, (m, alias, defunct) in enumerate(imports):
e = (alias or m or self).end_pos e = (alias or m or self).end_pos
@@ -477,25 +480,26 @@ class Parser(object):
defunct=True) defunct=True)
self._check_user_stmt(i) self._check_user_stmt(i)
self.freshscope = False self.freshscope = False
elif tok == 'from': elif tok_str == 'from':
defunct = False defunct = False
# take care for relative imports # take care for relative imports
relative_count = 0 relative_count = 0
while True: while True:
token_type, tok = self.next() tok = self.next()
if tok != '.': if tok.string != '.':
break break
relative_count += 1 relative_count += 1
# the from import # the from import
mod, token_type, tok = self._parse_dot_name(self._current) mod, tok = self._parse_dot_name(self._current)
tok_str = tok.string
if str(mod) == 'import' and relative_count: if str(mod) == 'import' and relative_count:
self._gen.push_last_back() self._gen.push_last_back()
tok = 'import' tok_str = 'import'
mod = None mod = None
if not mod and not relative_count or tok != "import": if not mod and not relative_count or tok_str != "import":
debug.warning("from: syntax error@%s", self.start_pos[0]) debug.warning("from: syntax error@%s", self.start_pos[0])
defunct = True defunct = True
if tok != 'import': if tok_str != 'import':
self._gen.push_last_back() self._gen.push_last_back()
names = self._parse_import_list() names = self._parse_import_list()
for count, (name, alias, defunct2) in enumerate(names): for count, (name, alias, defunct2) in enumerate(names):
@@ -511,10 +515,10 @@ class Parser(object):
self._scope.add_import(i) self._scope.add_import(i)
self.freshscope = False self.freshscope = False
# loops # loops
elif tok == 'for': elif tok_str == 'for':
set_stmt, tok = self._parse_statement(added_breaks=['in'], set_stmt, tok = self._parse_statement(added_breaks=['in'],
names_are_set_vars=True) names_are_set_vars=True)
if tok != 'in': if tok.string != 'in':
debug.warning('syntax err, for flow incomplete @%s', self.start_pos[0]) debug.warning('syntax err, for flow incomplete @%s', self.start_pos[0])
try: try:
@@ -524,23 +528,23 @@ class Parser(object):
s = [] if statement is None else [statement] s = [] if statement is None else [statement]
f = pr.ForFlow(self.module, s, first_pos, set_stmt) f = pr.ForFlow(self.module, s, first_pos, set_stmt)
self._scope = self._scope.add_statement(f) self._scope = self._scope.add_statement(f)
if tok != ':': if tok is None or tok.string != ':':
debug.warning('syntax err, for flow started @%s', self.start_pos[0]) debug.warning('syntax err, for flow started @%s', self.start_pos[0])
elif tok in ['if', 'while', 'try', 'with'] + extended_flow: elif tok_str in ['if', 'while', 'try', 'with'] + extended_flow:
added_breaks = [] added_breaks = []
command = tok command = tok_str
if command in ['except', 'with']: if command in ['except', 'with']:
added_breaks.append(',') added_breaks.append(',')
# multiple inputs because of with # multiple inputs because of with
inputs = [] inputs = []
first = True first = True
while first or command == 'with' and tok not in [':', '\n']: while first or command == 'with' and tok.string not in [':', '\n']:
statement, tok = \ statement, tok = \
self._parse_statement(added_breaks=added_breaks) self._parse_statement(added_breaks=added_breaks)
if command == 'except' and tok == ',': if command == 'except' and tok.string == ',':
# the except statement defines a var # the except statement defines a var
# this is only true for python 2 # this is only true for python 2
n, token_type, tok = self._parse_dot_name() n, tok = self._parse_dot_name()
if n: if n:
n.parent = statement n.parent = statement
statement.as_names.append(n) statement.as_names.append(n)
@@ -561,15 +565,15 @@ class Parser(object):
else: else:
s = self._scope.add_statement(f) s = self._scope.add_statement(f)
self._scope = s self._scope = s
if tok != ':': if tok.string != ':':
debug.warning('syntax err, flow started @%s', self.start_pos[0]) debug.warning('syntax err, flow started @%s', self.start_pos[0])
# returns # returns
elif tok in ['return', 'yield']: elif tok_str in ['return', 'yield']:
s = self.start_pos s = self.start_pos
self.freshscope = False self.freshscope = False
# add returns to the scope # add returns to the scope
func = self._scope.get_parent_until(pr.Function) func = self._scope.get_parent_until(pr.Function)
if tok == 'yield': if tok_str == 'yield':
func.is_generator = True func.is_generator = True
stmt, tok = self._parse_statement() stmt, tok = self._parse_statement()
@@ -582,7 +586,7 @@ class Parser(object):
except AttributeError: except AttributeError:
debug.warning('return in non-function') debug.warning('return in non-function')
# globals # globals
elif tok == 'global': elif tok_str == 'global':
stmt, tok = self._parse_statement(self._current) stmt, tok = self._parse_statement(self._current)
if stmt: if stmt:
self._scope.add_statement(stmt) self._scope.add_statement(stmt)
@@ -592,13 +596,13 @@ class Parser(object):
# important. # important.
self.module.add_global(t) self.module.add_global(t)
# decorator # decorator
elif tok == '@': elif tok_str == '@':
stmt, tok = self._parse_statement() stmt, tok = self._parse_statement()
if stmt is not None: if stmt is not None:
self._decorators.append(stmt) self._decorators.append(stmt)
elif tok == 'pass': elif tok_str == 'pass':
continue continue
elif tok == 'assert': elif tok_str == 'assert':
stmt, tok = self._parse_statement() stmt, tok = self._parse_statement()
if stmt is not None: if stmt is not None:
stmt.parent = use_as_parent_scope stmt.parent = use_as_parent_scope
@@ -606,7 +610,7 @@ class Parser(object):
# default # default
elif token_type in [tokenize.NAME, tokenize.STRING, elif token_type in [tokenize.NAME, tokenize.STRING,
tokenize.NUMBER] \ tokenize.NUMBER] \
or tok in statement_toks: or tok_str in statement_toks:
# this is the main part - a name can be a function or a # this is the main part - a name can be a function or a
# normal var, which can follow anything. but this is done # normal var, which can follow anything. but this is done
# by the statement parser. # by the statement parser.
@@ -616,7 +620,7 @@ class Parser(object):
self.freshscope = False self.freshscope = False
else: else:
if token_type not in [tokenize.COMMENT, tokenize.NEWLINE]: if token_type not in [tokenize.COMMENT, tokenize.NEWLINE]:
debug.warning('Token not used: %s %s %s', tok, debug.warning('Token not used: %s %s %s', tok_str,
tokenize.tok_name[token_type], self.start_pos) tokenize.tok_name[token_type], self.start_pos)
continue continue
self.no_docstr = False self.no_docstr = False