Simplify the fast parser tokenizer more. Now it is more readable and less buggy (+bugfixes).

2015-01-31 20:09:44 +01:00
parent 1826f432c8
commit bc118e8047
5 changed files with 44 additions and 65 deletions
--- a/jedi/evaluate/representation.py
+++ b/jedi/evaluate/representation.py
@@ -177,6 +177,7 @@ class Instance(use_metaclass(CachedMetaClass, Executed)):
        return names
    def get_subscope_by_name(self, name):
        print(name)
        sub = self.base.get_subscope_by_name(name)
        return get_instance_el(self._evaluator, self, sub, True)
--- a/jedi/parser/init.py
+++ b/jedi/parser/init.py
@@ -301,13 +301,13 @@ class Parser(object):
    def _tokenize(self, tokenizer):
        for typ, value, start_pos, prefix in tokenizer:
           # print(token.tok_name[typ], repr(value), start_pos, repr(prefix))
            if self._omit_dedent and typ == token.DEDENT:
                self._omit_dedent -= 1
                continue
            if typ == token.OP:
                typ = token.opmap[value]
            print(token.tok_name[typ], repr(value), start_pos, repr(prefix))
            yield typ, value, prefix, start_pos
    def __repr__(self):
@@ -322,7 +322,6 @@ class Parser(object):
        endmarker = self.module.children[-1]
        # The newline is either in the endmarker as a prefix or the previous
        # leaf as a newline token.
        print('REMOVE', endmarker.start_pos)
        if endmarker.prefix.endswith('\n'):
            endmarker.prefix = endmarker.prefix[:-1]
            last_line = re.sub('.*\n', '', endmarker.prefix)
--- a/jedi/parser/fast.py
+++ b/jedi/parser/fast.py
@@ -12,6 +12,7 @@ from jedi.parser import Parser
 from jedi.parser import tree as pr
 from jedi.parser import tokenize
 from jedi import cache
 from jedi import debug
 from jedi.parser.tokenize import (source_tokens, NEWLINE,
                                  ENDMARKER, INDENT, DEDENT)
@@ -155,7 +156,6 @@ class ParserNode(object):
        self._node_children = []
        scope = self._content_scope
        scope.children = list(self._old_children)
        print('reset', scope.children)
        try:
            # This works if it's a MergedNamesDict.
            # We are correcting it, because the MergedNamesDicts are artificial
@@ -187,14 +187,11 @@ class ParserNode(object):
        nodes should be added anymore.
        """
        print('CLOSE NODE', id(self), self.parent, self._node_children)
        print(self.parser.module.names_dict, [p.parser.module.names_dict for p in
    self._node_children])
        # We only need to replace the dict if multiple dictionaries are used:
        if self._node_children:
            dcts = [n.parser.module.names_dict for n in self._node_children]
            # Need to insert the own node as well.
            dcts.insert(0, self._content_scope.names_dict)
            print('DCTS', self.parser, dcts, self._node_children)
            self._content_scope.names_dict = MergedNamesDict(dcts)
    def parent_until_indent(self, indent=None):
@@ -411,19 +408,15 @@ class FastParser(use_metaclass(CachedFastParser)):
                print('OFF', line_offset + 1, self.current_node.parser.module.end_pos)
                #import pdb; pdb.set_trace()
            if is_first or line_offset + 1 == self.current_node.parser.module.end_pos[0]:
                print(repr(code_part))
                indent = len(code_part) - len(code_part.lstrip('\t '))
                self.current_node = self.current_node.parent_until_indent(indent)
-                print('cur', id(self.current_node))
+                # print '#'*45,line_offset, p.module.end_pos, '\n', code_part
                # check if code_part has already been parsed
                # print '#'*45,line_offset, p and p.module.end_pos, '\n', code_part
                self.current_node = self._get_node(code_part, source[start:],
                                                   line_offset, nodes, not is_first)
                if False and is_first and self.current_node.parser.module.subscopes:
                    print('NOXXXX')
                    raise NotImplementedError
                    # Special case, we cannot use a function subscope as a
                    # base scope, subscopes would save all the other contents
@@ -474,6 +467,9 @@ class FastParser(use_metaclass(CachedFastParser)):
        """ TODO used?
        self.module.end_pos = self.parsers[-1].module.end_pos
        """
        debug.dbg('Parsed %s, with %s parsers in %s splits.' 
                  % (self.module_path, self.number_parsers_used,
                     self.number_of_splits))
        # print(self.parsers[0].module.get_code())
@@ -483,7 +479,7 @@ class FastParser(use_metaclass(CachedFastParser)):
        """
        h = hash(source)
        for index, node in enumerate(nodes):
-            print('EQ', node, repr(node.source), repr(source))
+            #print('EQ', node, repr(node.source), repr(source))
            if node.hash == h and node.source == source:
                node.reset_node()
                nodes.remove(node)
@@ -497,8 +493,7 @@ class FastParser(use_metaclass(CachedFastParser)):
            node = ParserNode(self.module)
            end = line_offset + p.module.end_pos[0]
-            print('\nACTUALLY PARSING', p.module.end_pos, repr(source),
+            print('\nACTUALLY PARSING', p.module.end_pos, repr(source), len(self._lines), line_offset)
        len(self._lines), line_offset)
            if not (len(self._lines) == end):
                # We don't keep the last line, except if were done. A newline
                # ends on the next line, which is part of the next parser. But
@@ -525,15 +520,12 @@ class FastTokenizer(object):
        # fast parser options
        self.current = self.previous = NEWLINE, '', (0, 0)
        self._in_flow = False
        self._new_indent = False
        self._parser_indent = self._old_parser_indent = 0
        self._is_decorator = False
        self._first_stmt = True
        self._parentheses_level = 0
        self._indent_counter = 0
        self._flow_indent_counter = 0
        self._returned_endmarker = False
        self._next_dedent_noclose = False
    def __iter__(self):
        return self
@@ -555,65 +547,35 @@ class FastTokenizer(object):
        self.previous = self.current
        self.current = current
        # this is exactly the same check as in fast_parser, but this time with
        # tokenize and therefore precise.
        breaks = ['def', 'class', '@']
        if typ == INDENT:
            self._indent_counter += 1
        elif typ == DEDENT:
            self._indent_counter -= 1
-            print('DEDENT', self._flow_indent_counter, start_pos, self._indent_counter)
+            print(self._in_flow, self._indent_counter, self._flow_indent_counter)
            if self._in_flow and self._indent_counter == self._flow_indent_counter:
                self._in_flow = False
-                self._next_dedent_noclose = True
+            elif not self._in_flow:
                self._closed = True
            return current
-        if self.previous[0] in (NEWLINE, INDENT, DEDENT):
+        # Parentheses ignore the indentation rules. The other three stand for
-            if self.previous[0] == DEDENT:
+        # new lines.
-                if not self._in_flow:
+        if self.previous[0] in (NEWLINE, INDENT, DEDENT) \
-                    if not self._next_dedent_noclose:
+                and not self._parentheses_level:
                        self._first_stmt = False
                        return self._close()
                    self._next_dedent_noclose = False
            # Check for NEWLINE, which symbolizes the indent.
            #print('X', repr(value), tokenize.tok_name[typ])
            indent = start_pos[1]
            #print(indent, self._parser_indent)
            if self._parentheses_level:
                # Parentheses ignore the indentation rules.
                pass
            elif False and indent < self._parser_indent:  # -> dedent
                raise NotImplementedError
                self._parser_indent = indent
                self._new_indent = False
                print(self._in_flow, indent, self._old_parser_indent)
                if not self._in_flow or indent < self._old_parser_indent:
                    return self._close()
                self._in_flow = False
            elif self._new_indent:
                self._parser_indent = indent
                self._new_indent = False
            if not self._in_flow:
                self._in_flow = value in FLOWS
                if self._in_flow:
                    print('INFLOW', self._indent_counter)
                    self._flow_indent_counter = self._indent_counter
-                    #self._old_parser_indent = self._parser_indent
+                elif value in ('def', 'class', '@'):
-                    #self._parser_indent += 1  # new scope: must be higher
+                    # The values here are exactly the same check as in
-                    #self._new_indent = True
+                    # _split_parts, but this time with tokenize and therefore
-                elif value in breaks:
+                    # precise.
                    if not self._is_decorator:
                        return self._close()
                    self._is_decorator = '@' == value
                    #if not self._is_decorator:
                        #self._old_parser_indent = self._parser_indent
                        #self._parser_indent += 1  # new scope: must be higher
                        #self._new_indent = True
        if value in '([{' and value:
            self._parentheses_level += 1
@@ -628,8 +590,6 @@ class FastTokenizer(object):
            # Continue like nothing has happened, because we want to enter
            # the first class/function.
            if self.current[1] != '@':
                #if self._first_stmt and not self._new_indent:
                    #self._parser_indent = indent
                self._first_stmt = False
            return self.current
        else:
@@ -637,18 +597,19 @@ class FastTokenizer(object):
            return self._finish_dedents()
    def _finish_dedents(self):
        start_pos = self.current[2]
        print('FINISH', self._indent_counter)
        if self._indent_counter:
            self._indent_counter -= 1
-            return tokenize.DEDENT, '', start_pos, ''
+            return tokenize.DEDENT, '', self.current[2], ''
        elif not self._returned_endmarker:
            self._returned_endmarker = True
            # We're using the current prefix for the endmarker to not loose any
            # information. However we care about "lost" lines. The prefix of
            # the current line (indent) will always be included in the current
            # line.
-            prefix = re.sub('[^\n]+$', '', self.current[3])
+            t, _, start_pos, prefix = next(self._gen)
            # \Z for the end of the string. $ is bugged, because it has the
            # same behavior with or without re.MULTILINE.
            prefix = re.sub(r'[^\n]+\Z', '', prefix)
            return ENDMARKER, '', start_pos, prefix
        else:
            raise StopIteration
--- a/jedi/parser/tree.py
+++ b/jedi/parser/tree.py
@@ -706,7 +706,7 @@ class Class(ClassOrFunc):
        super(Class, self).__init__(children)
    def get_super_arglist(self):
-        if len(self.children) == 4:  # Has no parentheses
+        if self.children[2] != '(':  # Has no parentheses
            return None
        else:
            if self.children[3] == ')':  # Empty parentheses
--- a/test/test_parser/test_fast_parser.py
+++ b/test/test_parser/test_fast_parser.py
@@ -180,6 +180,24 @@ def test_nested_funcs():
    check_fp(src, 3)
 def test_class_and_if():
    src = dedent("""\
    class V:
        def __init__(self):
            pass
        if 1:
            c = 3
    def a_func():
        return 1
    # COMMENT
    a_func()""")
    check_fp(src, 5, 5)
    assert [d.name for d in jedi.Script(src).goto_definitions()] == ['int']
 def test_func_with_for_and_comment():
    # The first newline is important, leave it. It should not trigger another
    # parser split.