parsing refactorings, because the errors were not caught the right way

David Halter
2012-09-14 21:00:57 +02:00
parent b073440369
commit fd37ae53f2


@@ -1122,7 +1122,6 @@ class PyFuzzyParser(object):
        # Stuff to fix tokenize errors. The parser is pretty good at
        # tolerating tokenize errors and simply parses ahead.
self._tokenize_start_pos = 0
self._line_of_tokenize_restart = 0
self.parse()
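
The surviving _line_of_tokenize_restart counter exists because tokenize starts counting from line 1 again after every restart, so the parser has to add that offset back when it reports positions. A minimal sketch of that mapping, assuming a start_pos property; the class below is illustrative, not the actual jedi source:

class PositionExample(object):
    def __init__(self):
        self._line_of_tokenize_restart = 0  # lines consumed before a restart
        self._tokenize_start_pos = (1, 0)   # raw position from tokenize

    @property
    def start_pos(self):
        # tokenize counts from line 1 after every restart; shift the raw
        # line number back into the original file's numbering
        line, column = self._tokenize_start_pos
        return line + self._line_of_tokenize_restart, column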
@@ -1384,114 +1383,123 @@ class PyFuzzyParser(object):
tok_list = []
while not (tok in always_break or tok in breaks and level <= 0):
set_string = None
#print 'parse_stmt', tok, tokenize.tok_name[token_type]
tok_list.append(self.current + (self.start_pos,))
if tok == 'as':
string += " %s " % tok
token_type, tok = self.next()
if token_type == tokenize.NAME:
n, token_type, tok = self._parsedotname(self.current)
if n:
set_vars.append(n)
tok_list.append(n)
string += ".".join(n.names)
continue
elif token_type == tokenize.NAME:
if tok in ['return', 'yield', 'del', 'raise', 'assert']:
if len(tok_list) > 1:
                        # This happens when a statement has opening brackets
                        # that are never closed; here I just start a new
                        # statement. This is a hack, but I could not come up
                        # with a better solution.
                        # This is basically a reset of the statement.
debug.warning('keyword in statement %s@%s', tok_list,
self.start_pos[0])
tok_list = [self.current + (self.start_pos,)]
set_vars = []
used_funcs = []
used_vars = []
level = 0
set_string = tok + ' '
if tok in ['return', 'yield']:
is_return = tok
elif tok == 'for':
# list comprehensions!
middle, tok = self._parse_statement(added_breaks=['in'])
if tok != 'in' or middle is None:
if middle is None:
level -= 1
debug.warning('list comprehension formatting @%s' %
self.start_pos[0])
continue
b = [')', ']']
in_clause, tok = self._parse_statement(added_breaks=b,
list_comp=True)
if tok not in b or in_clause is None:
if in_clause is None:
self.gen.push_back(self._current_full)
debug.warning('list comprehension in_clause %s@%s' %
(tok, self.start_pos[0]))
continue
other_level = 0
for i, tok in enumerate(reversed(tok_list)):
if not isinstance(tok, Name):
tok = tok[1]
if tok in closing_brackets:
other_level -= 1
elif tok in opening_brackets:
other_level += 1
if other_level > 0:
break
else:
i = 0 # could not detect brackets -> nested list comp
tok_list, toks = tok_list[:-i], tok_list[-i:-1]
src = ''
for t in toks:
src += t[1] if isinstance(t, tuple) else t.get_code()
                    st = Statement(src, [], [], [],
                            toks, first_pos, self.end_pos)
for s in [st, middle, in_clause]:
s.parent = weakref.ref(self.scope)
tok = ListComprehension(st, middle, in_clause)
tok_list.append(tok)
if list_comp:
string = ''
string += tok.get_code()
continue
else:
n, token_type, tok = self._parsedotname(self.current)
                    tok_list.pop()  # remove the last entry; we add the Name instead
if n:
try:
set_string = None
#print 'parse_stmt', tok, tokenize.tok_name[token_type]
tok_list.append(self.current + (self.start_pos,))
if tok == 'as':
string += " %s " % tok
token_type, tok = self.next()
if token_type == tokenize.NAME:
n, token_type, tok = self._parsedotname(self.current)
if n:
set_vars.append(n)
tok_list.append(n)
if tok == '(':
# it must be a function
used_funcs.append(n)
else:
used_vars.append(n)
if string and re.match(r'[\w\d\'"]', string[-1]):
string += ' '
string += ".".join(n.names)
continue
                elif '=' in tok and tok not in ['>=', '<=', '==', '!=']:
                    # there has been an assignment -> change vars
if level == 0:
set_vars = used_vars
used_vars = []
elif tok in opening_brackets:
level += 1
elif tok in closing_brackets:
level -= 1
elif token_type == tokenize.NAME:
if tok in ['return', 'yield', 'del', 'raise', 'assert']:
if len(tok_list) > 1:
                            # This happens when a statement has opening
                            # brackets that are never closed; here I just
                            # start a new statement. This is a hack, but I
                            # could not come up with a better solution.
                            # This is basically a reset of the statement.
debug.warning('keyword in statement %s@%s',
tok_list, self.start_pos[0])
tok_list = [self.current + (self.start_pos,)]
set_vars = []
used_funcs = []
used_vars = []
level = 0
set_string = tok + ' '
if tok in ['return', 'yield']:
is_return = tok
elif tok == 'for':
# list comprehensions!
middle, tok = self._parse_statement(
added_breaks=['in'])
if tok != 'in' or middle is None:
if middle is None:
level -= 1
debug.warning('list comprehension formatting @%s' %
self.start_pos[0])
continue
b = [')', ']']
in_clause, tok = self._parse_statement(added_breaks=b,
list_comp=True)
if tok not in b or in_clause is None:
if in_clause is None:
self.gen.push_back(self._current_full)
debug.warning('list comprehension in_clause %s@%s'
% (tok, self.start_pos[0]))
continue
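                    # scan backwards for the unmatched opening bracket that
                    # started this comprehension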
other_level = 0
for i, tok in enumerate(reversed(tok_list)):
if not isinstance(tok, Name):
tok = tok[1]
if tok in closing_brackets:
other_level -= 1
elif tok in opening_brackets:
other_level += 1
if other_level > 0:
break
else:
# could not detect brackets -> nested list comp
i = 0
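                    # split at that bracket: the tokens after it (minus the
                    # current 'for' token) become the expression part of the
                    # comprehension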
tok_list, toks = tok_list[:-i], tok_list[-i:-1]
src = ''
for t in toks:
src += t[1] if isinstance(t, tuple) \
else t.get_code()
                        st = Statement(src, [], [], [],
                            toks, first_pos, self.end_pos)
for s in [st, middle, in_clause]:
s.parent = weakref.ref(self.scope)
tok = ListComprehension(st, middle, in_clause)
tok_list.append(tok)
if list_comp:
string = ''
string += tok.get_code()
continue
else:
n, token_type, tok = self._parsedotname(self.current)
                        # remove the last entry, because we add the Name
                        # object instead
                        tok_list.pop()
if n:
tok_list.append(n)
if tok == '(':
# it must be a function
used_funcs.append(n)
else:
used_vars.append(n)
if string and re.match(r'[\w\d\'"]', string[-1]):
string += ' '
string += ".".join(n.names)
continue
                    elif '=' in tok and tok not in ['>=', '<=', '==', '!=']:
                        # there has been an assignment -> change vars
if level == 0:
set_vars = used_vars
used_vars = []
elif tok in opening_brackets:
level += 1
elif tok in closing_brackets:
level -= 1
if set_string is not None:
string = set_string
else:
string += tok
token_type, tok = self.next()
except StopIteration:
# comes from tokenizer
break
if set_string is not None:
string = set_string
else:
string += tok
token_type, tok = self.next()
if not string:
return None, tok
#print 'new_stat', string, set_vars, used_funcs, used_vars
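
The substantive change in this hunk is that the per-token loop body now runs inside a try block, so a StopIteration raised by self.next() mid-statement ends the statement instead of escaping the loop. A reduced sketch of that control flow, with standalone names rather than the jedi API:

def consume_statement(gen, breaks=(';',)):
    # Collect tokens until a break token; a tokenizer that raises
    # StopIteration mid-statement just ends the statement, mirroring the
    # try/except StopIteration wrapper added above.
    collected = []
    try:
        tok = next(gen)
        while tok not in breaks:
            collected.append(tok)
            tok = next(gen)
    except StopIteration:
        pass  # comes from the tokenizer
    return collected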
@@ -1526,7 +1534,24 @@ class PyFuzzyParser(object):
def next(self):
""" Generate the next tokenize pattern. """
self._current_full = next(self.gen)
try:
self._current_full = next(self.gen)
except tokenize.TokenError:
            # We just ignore this error; I try to handle it earlier, as
            # well as possible.
debug.warning('parentheses not closed error')
except IndentationError:
            # This is an error that tokenize may produce because the code
            # is not indented as it should be. Here we just ignore that
            # line and restart the tokenizer.
            # (This is a rather unlikely error; for normal code, tokenize
            # seems to be pretty tolerant.)
debug.warning('indentation error on line %s, ignoring it' %
(self.start_pos[0]))
self._line_of_tokenize_restart = self.start_pos[0] + 1
self.gen = PushBackIterator(tokenize_func(self.buf.readline))
return self.next()
type, tok, self._tokenize_start_pos, self._tokenize_end_pos, \
self.parserline = self._current_full
if self.user_position and (self.start_pos[0] == self.user_position[0]
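
The recovery in next() can be shown in isolation. Below is a standalone sketch of the idea using only the stdlib tokenize module; it is not the jedi code itself (the real parser re-tokenizes its existing buffer in place and also supports push-back):

import io
import tokenize

def tolerant_tokens(source):
    # On an IndentationError from tokenize, skip the offending line,
    # re-tokenize the rest, and keep a line offset so yielded positions
    # still refer to the original source.
    offset = 0
    lines = source.splitlines(True)
    while offset < len(lines):
        buf = io.StringIO(''.join(lines[offset:]))
        try:
            for tok in tokenize.generate_tokens(buf.readline):
                yield tok[0], tok[1], (tok[2][0] + offset, tok[2][1])
            return
        except tokenize.TokenError:
            return  # brackets never closed; nothing left to recover
        except IndentationError as e:
            # the reported line is relative to the restarted buffer;
            # lineno >= 1, so the loop always makes progress
            offset += e.lineno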
@@ -1550,8 +1575,8 @@ class PyFuzzyParser(object):
:raises: IndentationError
"""
buf = BytesIO(self.code)
self.gen = PushBackIterator(tokenize_func(buf.readline))
self.buf = BytesIO(self.code)
self.gen = PushBackIterator(tokenize_func(self.buf.readline))
extended_flow = ['else', 'elif', 'except', 'finally']
statement_toks = ['{', '[', '(', '`']
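
Keeping the BytesIO buffer around as self.buf is what allows next() to rebuild the token generator mid-parse. The PushBackIterator wrapping it provides the one-token lookahead used by self.gen.push_back(...) above; a minimal sketch of such a wrapper (the real class lives in jedi and may differ in detail):

class PushBackIterator(object):
    def __init__(self, iterator):
        self.pushes = []
        self.iterator = iterator

    def push_back(self, value):
        self.pushes.append(value)

    def __iter__(self):
        return self

    def next(self):  # Python 2 iterator protocol, matching this code base
        return self.__next__()

    def __next__(self):
        # pushed-back values are returned first, newest first
        if self.pushes:
            return self.pushes.pop()
        return next(self.iterator)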
@@ -1725,20 +1750,6 @@ class PyFuzzyParser(object):
self.start_pos[0])
except StopIteration: # thrown on EOF
break
except tokenize.TokenError:
            # We just ignore this error; I try to handle it earlier, as
            # well as possible.
debug.warning('parentheses not closed error')
except IndentationError:
            # This is an error that tokenize may produce because the code
            # is not indented as it should be. Here we just ignore that
            # line and restart the tokenizer.
            # (This is a rather unlikely error; for normal code, tokenize
            # seems to be pretty tolerant.)
self._line_of_tokenize_restart = self.start_pos[0] + 1
self._tokenize_start_pos = (0, 0)
self._tokenize_end_pos = (0, 0)
debug.warning('indentation error on line %s, ignoring it' %
(self.start_pos[0]))
self.gen = PushBackIterator(tokenize_func(buf.readline))
del self.buf
return self.module
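
The error handling deleted from this loop is exactly what moved into next() above, so recovery now happens where the token is fetched instead of in the main parse loop. Both exception types really do come out of the stdlib tokenizer; a quick standalone check (exact messages vary across Python versions):

import io
import tokenize

for src in ('(1, 2\n',                  # bracket never closed
            'if x:\n    pass\n  y\n'):  # unindent does not match
    try:
        list(tokenize.generate_tokens(io.StringIO(src).readline))
    except (tokenize.TokenError, IndentationError) as e:
        print(type(e).__name__, '->', e)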