Add a new parser, check that pgen2 would work. (already modified outside this repository)

Dave Halter
2014-10-08 14:26:52 +02:00
parent 09a7317bc9
commit 834172a3e9
9 changed files with 2099 additions and 0 deletions

jedi/parser/grammar.txt Normal file

@@ -0,0 +1,158 @@
# Grammar for 2to3. This grammar supports Python 2.x and 3.x.
# Note: Changing the grammar specified in this file will most likely
# require corresponding changes in the parser module
# (../Modules/parsermodule.c). If you can't make the changes to
# that module yourself, please co-ordinate the required changes
# with someone who can; ask around on python-dev for help. Fred
# Drake <fdrake@acm.org> will probably be listening there.
# NOTE WELL: You should also follow all the steps listed in PEP 306,
# "How to Change Python's Grammar"
# Commands for Kees Blom's railroad program
#diagram:token NAME
#diagram:token NUMBER
#diagram:token STRING
#diagram:token NEWLINE
#diagram:token ENDMARKER
#diagram:token INDENT
#diagram:output\input python.bla
#diagram:token DEDENT
#diagram:output\textwidth 20.04cm\oddsidemargin 0.0cm\evensidemargin 0.0cm
#diagram:rules
# Start symbols for the grammar:
# file_input is a module or sequence of commands read from an input file;
# single_input is a single interactive statement;
# eval_input is the input for the eval() and input() functions.
# NB: compound_stmt in single_input is followed by extra NEWLINE!
file_input: (NEWLINE | stmt)* ENDMARKER
single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
eval_input: testlist NEWLINE* ENDMARKER
decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef)
funcdef: 'def' NAME parameters ['->' test] ':' suite
parameters: '(' [typedargslist] ')'
typedargslist: ((tfpdef ['=' test] ',')*
('*' [tname] (',' tname ['=' test])* [',' '**' tname] | '**' tname)
| tfpdef ['=' test] (',' tfpdef ['=' test])* [','])
tname: NAME [':' test]
tfpdef: tname | '(' tfplist ')'
tfplist: tfpdef (',' tfpdef)* [',']
varargslist: ((vfpdef ['=' test] ',')*
('*' [vname] (',' vname ['=' test])* [',' '**' vname] | '**' vname)
| vfpdef ['=' test] (',' vfpdef ['=' test])* [','])
vname: NAME
vfpdef: vname | '(' vfplist ')'
vfplist: vfpdef (',' vfpdef)* [',']
stmt: simple_stmt | compound_stmt
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
small_stmt: (expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt |
import_stmt | global_stmt | exec_stmt | assert_stmt)
expr_stmt: testlist_star_expr (augassign (yield_expr|testlist) |
('=' (yield_expr|testlist_star_expr))*)
testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']
augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' | '**=' | '//=')
# For normal assignments, additional restrictions enforced by the interpreter
print_stmt: 'print' ( [ test (',' test)* [','] ] |
'>>' test [ (',' test)+ [','] ] )
del_stmt: 'del' exprlist
pass_stmt: 'pass'
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
return_stmt: 'return' [testlist]
yield_stmt: yield_expr
raise_stmt: 'raise' [test ['from' test | ',' test [',' test]]]
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
import_from: ('from' ('.'* dotted_name | '.'+)
'import' ('*' | '(' import_as_names ')' | import_as_names))
import_as_name: NAME ['as' NAME]
dotted_as_name: dotted_name ['as' NAME]
import_as_names: import_as_name (',' import_as_name)* [',']
dotted_as_names: dotted_as_name (',' dotted_as_name)*
dotted_name: NAME ('.' NAME)*
global_stmt: ('global' | 'nonlocal') NAME (',' NAME)*
exec_stmt: 'exec' expr ['in' test [',' test]]
assert_stmt: 'assert' test [',' test]
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated
if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite]
while_stmt: 'while' test ':' suite ['else' ':' suite]
for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
try_stmt: ('try' ':' suite
((except_clause ':' suite)+
['else' ':' suite]
['finally' ':' suite] |
'finally' ':' suite))
with_stmt: 'with' with_item (',' with_item)* ':' suite
with_item: test ['as' expr]
with_var: 'as' expr
# NB compile.c makes sure that the default except clause is last
except_clause: 'except' [test [(',' | 'as') test]]
suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
# Backward compatibility cruft to support:
# [ x for x in lambda: True, lambda: False if x() ]
# even while also allowing:
# lambda x: 5 if x else 2
# (But not a mix of the two)
testlist_safe: old_test [(',' old_test)+ [',']]
old_test: or_test | old_lambdef
old_lambdef: 'lambda' [varargslist] ':' old_test
test: or_test ['if' or_test 'else' test] | lambdef
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: expr (comp_op expr)*
comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
star_expr: '*' expr
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('^' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
term: factor (('*'|'/'|'%'|'//') factor)*
factor: ('+'|'-'|'~') factor | power
power: atom trailer* ['**' factor]
atom: ('(' [yield_expr|testlist_gexp] ')' |
'[' [listmaker] ']' |
'{' [dictsetmaker] '}' |
'`' testlist1 '`' |
NAME | NUMBER | STRING+ | '.' '.' '.')
listmaker: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
testlist_gexp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
lambdef: 'lambda' [varargslist] ':' test
trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
subscriptlist: subscript (',' subscript)* [',']
subscript: test | [test] ':' [test] [sliceop]
sliceop: ':' [test]
exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
testlist: test (',' test)* [',']
dictsetmaker: ( (test ':' test (comp_for | (',' test ':' test)* [','])) |
(test (comp_for | (',' test)* [','])) )
classdef: 'class' NAME ['(' [arglist] ')'] ':' suite
arglist: (argument ',')* (argument [',']
|'*' test (',' argument)* [',' '**' test]
|'**' test)
argument: test [comp_for] | test '=' test # Really [keyword '='] test
comp_iter: comp_for | comp_if
comp_for: 'for' exprlist 'in' testlist_safe [comp_iter]
comp_if: 'if' old_test [comp_iter]
testlist1: test (',' test)*
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
yield_expr: 'yield' [testlist]
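The rules above are compiled into parser tables by the pgen module added further down in this commit. A minimal sketch of that step, assuming the modules are importable as jedi.parser.pgen2.* (as the relative imports below suggest) and that this file lives at jedi/parser/grammar.txt:

from jedi.parser.pgen2 import pgen

g = pgen.generate_grammar("jedi/parser/grammar.txt")
print(g.symbol2number["funcdef"])                # every rule name becomes a symbol number >= 256
print("def" in g.keywords)                       # quoted keywords turn into keyword arc labels -> True
print(g.start == g.symbol2number["file_input"])  # the first rule in this file is the start symbol -> True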


@@ -0,0 +1,114 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright 2006 Google, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
__all__ = ["Driver", "load_grammar"]
import os
import sys
import logging
import io
from . import pgen
from . import grammar
from . import parse
from . import token
from . import tokenize
class Driver(object):
def __init__(self, grammar, convert=None, logger=None):
self.grammar = grammar
if logger is None:
logger = logging.getLogger()
self.logger = logger
self.convert = convert
def parse_tokens(self, tokens):
"""Parse a series of tokens and return the syntax tree."""
# XXX Move the prefix computation into a wrapper around tokenize.
p = parse.Parser(self.grammar, self.convert)
lineno = 1
column = 0
type = value = start = end = line_text = None
prefix = ""
for quintuple in tokens:
type, value, start, end, line_text = quintuple
if start != (lineno, column):
assert (lineno, column) <= start, ((lineno, column), start)
s_lineno, s_column = start
if lineno < s_lineno:
prefix += "\n" * (s_lineno - lineno)
lineno = s_lineno
column = 0
if column < s_column:
prefix += line_text[column:s_column]
column = s_column
if type in (tokenize.COMMENT, tokenize.NL): # NL != NEWLINE
prefix += value
lineno, column = end
if value.endswith("\n"):
lineno += 1
column = 0
continue
if type == token.OP:
type = grammar.opmap[value]
#self.logger.debug("%s %r (prefix=%r)", token.tok_name[type], value, prefix)
if p.addtoken(type, value, (prefix, start)):
break
prefix = ""
lineno, column = end
if value.endswith("\n"):
lineno += 1
column = 0
else:
# We never broke out -- EOF is too soon (how can this happen???)
raise parse.ParseError("incomplete input",
type, value, (prefix, start))
return p.rootnode
def parse_string(self, text):
"""Parse a string and return the syntax tree."""
tokens = tokenize.generate_tokens(io.StringIO(text).readline)
return self.parse_tokens(tokens)
def load_grammar(grammar_path="grammar.txt", pickle_path=None,
save=True, force=False, logger=None):
"""Load the grammar (maybe from a pickle)."""
if logger is None:
logger = logging.getLogger()
if pickle_path is None:
head, tail = os.path.splitext(grammar_path)
if tail == ".txt":
tail = ""
pickle_path = head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
if force or not _newer(pickle_path, grammar_path):
logger.info("Generating grammar tables from %s", grammar_path)
g = pgen.generate_grammar(grammar_path)
# The pickle files mismatch when built on different architectures,
# so don't save them for now. An alternative solution might be to
# include the multiarch triplet in the file name.
if False:
logger.info("Writing grammar tables to %s", pickle_path)
try:
g.dump(pickle_path)
except OSError as e:
logger.info("Writing failed:" + str(e))
else:
g = grammar.Grammar()
g.load(pickle_path)
return g
def _newer(a, b):
"""Inquire whether file a was written since file b."""
if not os.path.exists(a):
return False
if not os.path.exists(b):
return True
return os.path.getmtime(a) >= os.path.getmtime(b)
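A hedged usage sketch for the driver above (not part of the commit). It assumes Driver and load_grammar are importable from the jedi.parser.pgen2 package, as the test file and pytree.py below expect. The converter is a stand-in: the parser assigns a used_names attribute to the root node, so the default tuple nodes will not do.

import logging
from types import SimpleNamespace
from jedi.parser.pgen2 import Driver, load_grammar

def convert(gr, raw_node):
    # Wrap each reduced node in an attribute-capable object instead of a tuple.
    type_, value, context, children = raw_node
    return SimpleNamespace(type=type_, value=value, context=context,
                           children=children)

grammar = load_grammar("jedi/parser/grammar.txt")
d = Driver(grammar, convert=convert, logger=logging.getLogger())
tree = d.parse_string("def f(x):\n    return x\n")
print(tree.type)        # symbol number of file_input
print(tree.used_names)  # every NAME token seen, keywords included: {'def', 'f', 'x', 'return'}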


@@ -0,0 +1,182 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""This module defines the data structures used to represent a grammar.
These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.
There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.
"""
# Python imports
import pickle
# Local imports
from . import token
class Grammar(object):
"""Pgen parsing tables conversion class.
Once initialized, this class supplies the grammar tables for the
parsing engine implemented by parse.py. The parsing engine
accesses the instance variables directly. The class here does not
provide initialization of the tables; several subclasses exist to
do this (see the conv and pgen modules).
The load() method reads the tables from a pickle file, which is
much faster than the other ways offered by subclasses. The pickle
file is written by calling dump() (after loading the grammar
tables using a subclass). The report() method prints a readable
representation of the tables to stdout, for debugging.
The instance variables are as follows:
symbol2number -- a dict mapping symbol names to numbers. Symbol
numbers are always 256 or higher, to distinguish
them from token numbers, which are between 0 and
255 (inclusive).
number2symbol -- a dict mapping numbers to symbol names;
these two are each other's inverse.
states -- a list of DFAs, where each DFA is a list of
states, each state is a list of arcs, and each
arc is a (i, j) pair where i is a label and j is
a state number. The DFA number is the index into
this list. (This name is slightly confusing.)
Final states are represented by a special arc of
the form (0, j) where j is its own state number.
dfas -- a dict mapping symbol numbers to (DFA, first)
pairs, where DFA is an item from the states list
above, and first is a set of tokens that can
begin this grammar rule (represented by a dict
whose values are always 1).
labels -- a list of (x, y) pairs where x is either a token
number or a symbol number, and y is either None
or a string; the strings are keywords. The label
number is the index in this list; label numbers
are used to mark state transitions (arcs) in the
DFAs.
start -- the number of the grammar's start symbol.
keywords -- a dict mapping keyword strings to arc labels.
tokens -- a dict mapping token numbers to arc labels.
"""
def __init__(self):
self.symbol2number = {}
self.number2symbol = {}
self.states = []
self.dfas = {}
self.labels = [(0, "EMPTY")]
self.keywords = {}
self.tokens = {}
self.symbol2label = {}
self.start = 256
def dump(self, filename):
"""Dump the grammar tables to a pickle file."""
with open(filename, "wb") as f:
pickle.dump(self.__dict__, f, 2)
def load(self, filename):
"""Load the grammar tables from a pickle file."""
with open(filename, "rb") as f:
d = pickle.load(f)
self.__dict__.update(d)
def copy(self):
"""
Copy the grammar.
"""
new = self.__class__()
for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
"tokens", "symbol2label"):
setattr(new, dict_attr, getattr(self, dict_attr).copy())
new.labels = self.labels[:]
new.states = self.states[:]
new.start = self.start
return new
def report(self):
"""Dump the grammar tables to standard output, for debugging."""
from pprint import pprint
print("s2n")
pprint(self.symbol2number)
print("n2s")
pprint(self.number2symbol)
print("states")
pprint(self.states)
print("dfas")
pprint(self.dfas)
print("labels")
pprint(self.labels)
print("start", self.start)
# Map from operator to number (since tokenize doesn't do this)
opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""
opmap = {}
for line in opmap_raw.splitlines():
if line:
op, name = line.split()
opmap[op] = getattr(token, name)
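A short sketch of what the tables described in the docstring look like once they are generated from grammar.txt; the module paths are assumptions based on the relative imports in this commit.

from jedi.parser.pgen2 import grammar, pgen, token

g = pgen.generate_grammar("jedi/parser/grammar.txt")
n = g.symbol2number["if_stmt"]
assert g.number2symbol[n] == "if_stmt"       # the two dicts are inverses
dfa, first = g.dfas[n]                       # the rule's DFA plus its first set
print(len(dfa), len(first))                  # states / labels that can start an if_stmt
print(grammar.opmap["->"] == token.RARROW)   # operator strings resolve to token numbers -> True

g.dump("py-grammar.pickle")                  # pickles g.__dict__ (protocol 2)
g2 = grammar.Grammar()
g2.load("py-grammar.pickle")
assert g2.symbol2number == g.symbol2number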

jedi/parser/pgen2/parse.py Normal file

@@ -0,0 +1,190 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""
Parser engine for the grammar tables generated by pgen.
The grammar table must be loaded first.
See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.
"""
# Local imports
from . import token
class ParseError(Exception):
"""Exception to signal the parser is stuck."""
def __init__(self, msg, type, value, context):
Exception.__init__(self, "%s: type=%r, value=%r, context=%r" %
(msg, type, value, context))
self.msg = msg
self.type = type
self.value = value
self.context = context
class Parser(object):
"""Parser engine.
The proper usage sequence is:
p = Parser(grammar, [converter]) # create instance; this also prepares for parsing
<for each input token>:
if p.addtoken(...): # parse a token; may raise ParseError
break
root = p.rootnode # root of abstract syntax tree
Unlike the original pgen2 parser, there is no separate setup() step; create
a new Parser instance for each token sequence.
A Parser instance contains state pertaining to the current token
sequence, and should not be used concurrently by different threads
to parse separate token sequences.
See driver.py for how to get input tokens by tokenizing a file or
string.
Parsing is complete when addtoken() returns True; the root of the
abstract syntax tree can then be retrieved from the rootnode
instance variable. When a syntax error occurs, addtoken() raises
the ParseError exception. There is no error recovery; the parser
cannot be used after a syntax error was reported (create a new
Parser instance instead).
"""
def __init__(self, grammar, convert=None):
"""Constructor.
The grammar argument is a grammar.Grammar instance; see the
grammar module for more information.
The parser is not ready yet for parsing; you must call the
setup() method to get it started.
The optional convert argument is a function mapping concrete
syntax tree nodes to abstract syntax tree nodes. If not
given, no conversion is done and the syntax tree produced is
the concrete syntax tree. If given, it must be a function of
two arguments, the first being the grammar (a grammar.Grammar
instance), and the second being the concrete syntax tree node
to be converted. The syntax tree is converted from the bottom
up.
A concrete syntax tree node is a (type, value, context, nodes)
tuple, where type is the node type (a token or symbol number),
value is None for symbols and a string for tokens, context is
None or an opaque value used for error reporting (typically a
(lineno, offset) pair), and nodes is a list of children for
symbols, and None for tokens.
An abstract syntax tree node may be anything; this is entirely
up to the converter function.
"""
self.grammar = grammar
self.convert = convert or (lambda grammar, node: node)
# Prepare for parsing.
start = self.grammar.start
# Each stack entry is a tuple: (dfa, state, node).
# A node is a tuple: (type, value, context, children),
# where children is a list of nodes or None, and context may be None.
newnode = (start, None, None, [])
stackentry = (self.grammar.dfas[start], 0, newnode)
self.stack = [stackentry]
self.rootnode = None
self.used_names = set() # Aliased to self.rootnode.used_names in pop()
def addtoken(self, type, value, context):
"""Add a token; return True iff this is the end of the program."""
# Map from token to label
ilabel = self.classify(type, value, context)
# Loop until the token is shifted; may raise exceptions
while True:
dfa, state, node = self.stack[-1]
states, first = dfa
arcs = states[state]
# Look for a state with this label
for i, newstate in arcs:
t, v = self.grammar.labels[i]
if ilabel == i:
# Look it up in the list of labels
assert t < 256
# Shift a token; we're done with it
self.shift(type, value, newstate, context)
# Pop while we are in an accept-only state
state = newstate
while states[state] == [(0, state)]:
self.pop()
if not self.stack:
# Done parsing!
return True
dfa, state, node = self.stack[-1]
states, first = dfa
# Done with this token
return False
elif t >= 256:
# See if it's a symbol and if we're in its first set
itsdfa = self.grammar.dfas[t]
itsstates, itsfirst = itsdfa
if ilabel in itsfirst:
# Push a symbol
self.push(t, self.grammar.dfas[t], newstate, context)
break # To continue the outer while loop
else:
if (0, state) in arcs:
# An accepting state, pop it and try something else
self.pop()
if not self.stack:
# Done parsing, but another token is input
raise ParseError("too much input",
type, value, context)
else:
# No success finding a transition
raise ParseError("bad input", type, value, context)
def classify(self, type, value, context):
"""Turn a token into a label. (Internal)"""
if type == token.NAME:
# Keep a listing of all used names
self.used_names.add(value)
# Check for reserved words
ilabel = self.grammar.keywords.get(value)
if ilabel is not None:
return ilabel
ilabel = self.grammar.tokens.get(type)
if ilabel is None:
raise ParseError("bad token", type, value, context)
return ilabel
def shift(self, type, value, newstate, context):
"""Shift a token. (Internal)"""
dfa, state, node = self.stack[-1]
newnode = (type, value, context, None)
newnode = self.convert(self.grammar, newnode)
if newnode is not None:
node[-1].append(newnode)
self.stack[-1] = (dfa, newstate, node)
def push(self, type, newdfa, newstate, context):
"""Push a nonterminal. (Internal)"""
dfa, state, node = self.stack[-1]
newnode = (type, None, context, [])
self.stack[-1] = (dfa, newstate, node)
self.stack.append((newdfa, 0, newnode))
def pop(self):
"""Pop a nonterminal. (Internal)"""
popdfa, popstate, popnode = self.stack.pop()
newnode = self.convert(self.grammar, popnode)
if newnode is not None:
if self.stack:
dfa, state, node = self.stack[-1]
node[-1].append(newnode)
else:
self.rootnode = newnode
self.rootnode.used_names = self.used_names
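A sketch of the failure mode described in the Parser docstring: addtoken() raises ParseError and the instance cannot be reused afterwards. The wiring mirrors the commit's own test (pytree supplies the grammar and the convert function); the module paths are assumptions.

from jedi.parser import pytree
from jedi.parser.pgen2 import Driver, parse

d = Driver(pytree.python_grammar, convert=pytree.convert)
try:
    d.parse_string("def :\n")        # ':' where the grammar requires a NAME
except parse.ParseError as e:
    print(e.msg, e.type, e.value)    # 'bad input', the COLON token number, ':'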

jedi/parser/pgen2/pgen.py Normal file

@@ -0,0 +1,387 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Pgen imports
from . import grammar, token, tokenize
class ParserGenerator(object):
def __init__(self, filename, stream=None):
close_stream = None
if stream is None:
stream = open(filename)
close_stream = stream.close
self.filename = filename
self.stream = stream
self.generator = tokenize.generate_tokens(stream.readline)
self.gettoken() # Initialize lookahead
self.dfas, self.startsymbol = self.parse()
if close_stream is not None:
close_stream()
self.first = {} # map from symbol name to set of tokens
self.addfirstsets()
def make_grammar(self):
c = grammar.Grammar()
names = list(self.dfas.keys())
names.sort()
names.remove(self.startsymbol)
names.insert(0, self.startsymbol)
for name in names:
i = 256 + len(c.symbol2number)
c.symbol2number[name] = i
c.number2symbol[i] = name
for name in names:
dfa = self.dfas[name]
states = []
for state in dfa:
arcs = []
for label, next in state.arcs.items():
arcs.append((self.make_label(c, label), dfa.index(next)))
if state.isfinal:
arcs.append((0, dfa.index(state)))
states.append(arcs)
c.states.append(states)
c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
c.start = c.symbol2number[self.startsymbol]
return c
def make_first(self, c, name):
rawfirst = self.first[name]
first = {}
for label in rawfirst:
ilabel = self.make_label(c, label)
##assert ilabel not in first # XXX failed on <> ... !=
first[ilabel] = 1
return first
def make_label(self, c, label):
# XXX Maybe this should be a method on a subclass of converter?
ilabel = len(c.labels)
if label[0].isalpha():
# Either a symbol name or a named token
if label in c.symbol2number:
# A symbol name (a non-terminal)
if label in c.symbol2label:
return c.symbol2label[label]
else:
c.labels.append((c.symbol2number[label], None))
c.symbol2label[label] = ilabel
return ilabel
else:
# A named token (NAME, NUMBER, STRING)
itoken = getattr(token, label, None)
assert isinstance(itoken, int), label
assert itoken in token.tok_name, label
if itoken in c.tokens:
return c.tokens[itoken]
else:
c.labels.append((itoken, None))
c.tokens[itoken] = ilabel
return ilabel
else:
# Either a keyword or an operator
assert label[0] in ('"', "'"), label
value = eval(label)
if value[0].isalpha():
# A keyword
if value in c.keywords:
return c.keywords[value]
else:
c.labels.append((token.NAME, value))
c.keywords[value] = ilabel
return ilabel
else:
# An operator (any non-numeric token)
itoken = grammar.opmap[value] # Fails if unknown token
if itoken in c.tokens:
return c.tokens[itoken]
else:
c.labels.append((itoken, None))
c.tokens[itoken] = ilabel
return ilabel
def addfirstsets(self):
names = list(self.dfas.keys())
names.sort()
for name in names:
if name not in self.first:
self.calcfirst(name)
#print name, self.first[name].keys()
def calcfirst(self, name):
dfa = self.dfas[name]
self.first[name] = None # dummy to detect left recursion
state = dfa[0]
totalset = {}
overlapcheck = {}
for label, next in state.arcs.items():
if label in self.dfas:
if label in self.first:
fset = self.first[label]
if fset is None:
raise ValueError("recursion for rule %r" % name)
else:
self.calcfirst(label)
fset = self.first[label]
totalset.update(fset)
overlapcheck[label] = fset
else:
totalset[label] = 1
overlapcheck[label] = {label: 1}
inverse = {}
for label, itsfirst in overlapcheck.items():
for symbol in itsfirst:
if symbol in inverse:
raise ValueError("rule %s is ambiguous; %s is in the"
" first sets of %s as well as %s" %
(name, symbol, label, inverse[symbol]))
inverse[symbol] = label
self.first[name] = totalset
def parse(self):
dfas = {}
startsymbol = None
# MSTART: (NEWLINE | RULE)* ENDMARKER
while self.type != token.ENDMARKER:
while self.type == token.NEWLINE:
self.gettoken()
# RULE: NAME ':' RHS NEWLINE
name = self.expect(token.NAME)
self.expect(token.OP, ":")
a, z = self.parse_rhs()
self.expect(token.NEWLINE)
#self.dump_nfa(name, a, z)
dfa = self.make_dfa(a, z)
#self.dump_dfa(name, dfa)
# oldlen = len(dfa)
self.simplify_dfa(dfa)
# newlen = len(dfa)
dfas[name] = dfa
#print name, oldlen, newlen
if startsymbol is None:
startsymbol = name
return dfas, startsymbol
def make_dfa(self, start, finish):
# To turn an NFA into a DFA, we define the states of the DFA
# to correspond to *sets* of states of the NFA. Then do some
# state reduction. Let's represent sets as dicts with 1 for
# values.
assert isinstance(start, NFAState)
assert isinstance(finish, NFAState)
def closure(state):
base = {}
addclosure(state, base)
return base
def addclosure(state, base):
assert isinstance(state, NFAState)
if state in base:
return
base[state] = 1
for label, next in state.arcs:
if label is None:
addclosure(next, base)
states = [DFAState(closure(start), finish)]
for state in states: # NB states grows while we're iterating
arcs = {}
for nfastate in state.nfaset:
for label, next in nfastate.arcs:
if label is not None:
addclosure(next, arcs.setdefault(label, {}))
for label, nfaset in arcs.items():
for st in states:
if st.nfaset == nfaset:
break
else:
st = DFAState(nfaset, finish)
states.append(st)
state.addarc(st, label)
return states # List of DFAState instances; first one is start
def dump_nfa(self, name, start, finish):
print("Dump of NFA for", name)
todo = [start]
for i, state in enumerate(todo):
print(" State", i, state is finish and "(final)" or "")
for label, next in state.arcs:
if next in todo:
j = todo.index(next)
else:
j = len(todo)
todo.append(next)
if label is None:
print(" -> %d" % j)
else:
print(" %s -> %d" % (label, j))
def dump_dfa(self, name, dfa):
print("Dump of DFA for", name)
for i, state in enumerate(dfa):
print(" State", i, state.isfinal and "(final)" or "")
for label, next in state.arcs.items():
print(" %s -> %d" % (label, dfa.index(next)))
def simplify_dfa(self, dfa):
# This is not theoretically optimal, but works well enough.
# Algorithm: repeatedly look for two states that have the same
# set of arcs (same labels pointing to the same nodes) and
# unify them, until things stop changing.
# dfa is a list of DFAState instances
changes = True
while changes:
changes = False
for i, state_i in enumerate(dfa):
for j in range(i + 1, len(dfa)):
state_j = dfa[j]
if state_i == state_j:
#print " unify", i, j
del dfa[j]
for state in dfa:
state.unifystate(state_j, state_i)
changes = True
break
def parse_rhs(self):
# RHS: ALT ('|' ALT)*
a, z = self.parse_alt()
if self.value != "|":
return a, z
else:
aa = NFAState()
zz = NFAState()
aa.addarc(a)
z.addarc(zz)
while self.value == "|":
self.gettoken()
a, z = self.parse_alt()
aa.addarc(a)
z.addarc(zz)
return aa, zz
def parse_alt(self):
# ALT: ITEM+
a, b = self.parse_item()
while (self.value in ("(", "[") or
self.type in (token.NAME, token.STRING)):
c, d = self.parse_item()
b.addarc(c)
b = d
return a, b
def parse_item(self):
# ITEM: '[' RHS ']' | ATOM ['+' | '*']
if self.value == "[":
self.gettoken()
a, z = self.parse_rhs()
self.expect(token.OP, "]")
a.addarc(z)
return a, z
else:
a, z = self.parse_atom()
value = self.value
if value not in ("+", "*"):
return a, z
self.gettoken()
z.addarc(a)
if value == "+":
return a, z
else:
return a, a
def parse_atom(self):
# ATOM: '(' RHS ')' | NAME | STRING
if self.value == "(":
self.gettoken()
a, z = self.parse_rhs()
self.expect(token.OP, ")")
return a, z
elif self.type in (token.NAME, token.STRING):
a = NFAState()
z = NFAState()
a.addarc(z, self.value)
self.gettoken()
return a, z
else:
self.raise_error("expected (...) or NAME or STRING, got %s/%s",
self.type, self.value)
def expect(self, type, value=None):
if self.type != type or (value is not None and self.value != value):
self.raise_error("expected %s/%s, got %s/%s",
type, value, self.type, self.value)
value = self.value
self.gettoken()
return value
def gettoken(self):
tup = next(self.generator)
while tup[0] in (tokenize.COMMENT, tokenize.NL):
tup = next(self.generator)
self.type, self.value, self.begin, self.end, self.line = tup
#print token.tok_name[self.type], repr(self.value)
def raise_error(self, msg, *args):
if args:
try:
msg = msg % args
except:
msg = " ".join([msg] + list(map(str, args)))
raise SyntaxError(msg, (self.filename, self.end[0],
self.end[1], self.line))
class NFAState(object):
def __init__(self):
self.arcs = [] # list of (label, NFAState) pairs
def addarc(self, next, label=None):
assert label is None or isinstance(label, str)
assert isinstance(next, NFAState)
self.arcs.append((label, next))
class DFAState(object):
def __init__(self, nfaset, final):
assert isinstance(nfaset, dict)
assert isinstance(next(iter(nfaset)), NFAState)
assert isinstance(final, NFAState)
self.nfaset = nfaset
self.isfinal = final in nfaset
self.arcs = {} # map from label to DFAState
def addarc(self, next, label):
assert isinstance(label, str)
assert label not in self.arcs
assert isinstance(next, DFAState)
self.arcs[label] = next
def unifystate(self, old, new):
for label, next in self.arcs.items():
if next is old:
self.arcs[label] = new
def __eq__(self, other):
# Equality test -- ignore the nfaset instance variable
assert isinstance(other, DFAState)
if self.isfinal != other.isfinal:
return False
# Can't just return self.arcs == other.arcs, because that
# would invoke this method recursively, with cycles...
if len(self.arcs) != len(other.arcs):
return False
for label, next in self.arcs.items():
if next is not other.arcs.get(label):
return False
return True
__hash__ = None # For Py3 compatibility.
def generate_grammar(filename="Grammar.txt"):
p = ParserGenerator(filename)
return p.make_grammar()
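ParserGenerator is not tied to grammar.txt; any file in the same rule format can be compiled. A toy sketch with a two-rule grammar written to a temporary file (module path assumed as above):

import os
import tempfile
from jedi.parser.pgen2 import pgen

source = (
    "start: item+ ENDMARKER\n"
    "item: NAME '=' NUMBER NEWLINE\n"
)
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write(source)
try:
    g = pgen.generate_grammar(f.name)
    print(g.symbol2number)           # {'start': 256, 'item': 257}; the first rule is the start symbol
    print(g.keywords)                # {} since the toy grammar has no quoted keywords
    states, first = g.dfas[g.start]
    print(len(states), len(first))   # DFA states and first-set labels of the start rule
finally:
    os.remove(f.name)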

jedi/parser/pgen2/token.py Executable file

@@ -0,0 +1,82 @@
#! /usr/bin/python3.4
"""Token constants (from "token.h")."""
# Taken from Python (r53757) and modified to include some tokens
# originally monkeypatched in by pgen2.tokenize
#--start constants--
ENDMARKER = 0
NAME = 1
NUMBER = 2
STRING = 3
NEWLINE = 4
INDENT = 5
DEDENT = 6
LPAR = 7
RPAR = 8
LSQB = 9
RSQB = 10
COLON = 11
COMMA = 12
SEMI = 13
PLUS = 14
MINUS = 15
STAR = 16
SLASH = 17
VBAR = 18
AMPER = 19
LESS = 20
GREATER = 21
EQUAL = 22
DOT = 23
PERCENT = 24
BACKQUOTE = 25
LBRACE = 26
RBRACE = 27
EQEQUAL = 28
NOTEQUAL = 29
LESSEQUAL = 30
GREATEREQUAL = 31
TILDE = 32
CIRCUMFLEX = 33
LEFTSHIFT = 34
RIGHTSHIFT = 35
DOUBLESTAR = 36
PLUSEQUAL = 37
MINEQUAL = 38
STAREQUAL = 39
SLASHEQUAL = 40
PERCENTEQUAL = 41
AMPEREQUAL = 42
VBAREQUAL = 43
CIRCUMFLEXEQUAL = 44
LEFTSHIFTEQUAL = 45
RIGHTSHIFTEQUAL = 46
DOUBLESTAREQUAL = 47
DOUBLESLASH = 48
DOUBLESLASHEQUAL = 49
AT = 50
OP = 51
COMMENT = 52
NL = 53
RARROW = 54
ERRORTOKEN = 55
N_TOKENS = 56
NT_OFFSET = 256
#--end constants--
tok_name = {}
for _name, _value in list(globals().items()):
if type(_value) is type(0):
tok_name[_value] = _name
def ISTERMINAL(x):
return x < NT_OFFSET
def ISNONTERMINAL(x):
return x >= NT_OFFSET
def ISEOF(x):
return x == ENDMARKER
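The constants above are plain integers; tok_name is the reverse map. A tiny sketch (module path assumed):

from jedi.parser.pgen2 import token

print(token.tok_name[token.RARROW])    # 'RARROW', the '->' operator needed for Python 3 annotations
print(token.ISTERMINAL(token.NAME))    # True: token numbers stay below NT_OFFSET
print(token.ISNONTERMINAL(256))        # True: grammar symbol numbers start at NT_OFFSET
print(token.ISEOF(token.ENDMARKER))    # True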


@@ -0,0 +1,503 @@
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.
"""Tokenization help for Python programs.
generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:
the token type (see token.py)
the token (a string)
the starting (row, column) indices of the token (a 2-tuple of ints)
the ending (row, column) indices of the token (a 2-tuple of ints)
the original line (string)
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators
Older entry points
tokenize_loop(readline, tokeneater)
tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
import string, re
from codecs import BOM_UTF8, lookup
from .token import *
from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
"generate_tokens", "untokenize"]
del token
try:
bytes
except NameError:
# Support bytes type in Python <= 2.5, so 2to3 turns itself into
# valid Python 3 code.
bytes = str
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'
Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
r"//=?", r"->",
r"[+\-*/%&|^=<>]=?",
r"~")
Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
group("'", r'\\\r?\n'),
r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
tokenprog, pseudoprog, single3prog, double3prog = list(map(
re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
"'''": single3prog, '"""': double3prog,
"r'''": single3prog, 'r"""': double3prog,
"u'''": single3prog, 'u"""': double3prog,
"b'''": single3prog, 'b"""': double3prog,
"ur'''": single3prog, 'ur"""': double3prog,
"br'''": single3prog, 'br"""': double3prog,
"R'''": single3prog, 'R"""': double3prog,
"U'''": single3prog, 'U"""': double3prog,
"B'''": single3prog, 'B"""': double3prog,
"uR'''": single3prog, 'uR"""': double3prog,
"Ur'''": single3prog, 'Ur"""': double3prog,
"UR'''": single3prog, 'UR"""': double3prog,
"bR'''": single3prog, 'bR"""': double3prog,
"Br'''": single3prog, 'Br"""': double3prog,
"BR'''": single3prog, 'BR"""': double3prog,
'r': None, 'R': None,
'u': None, 'U': None,
'b': None, 'B': None}
triple_quoted = {}
for t in ("'''", '"""',
"r'''", 'r"""', "R'''", 'R"""',
"u'''", 'u"""', "U'''", 'U"""',
"b'''", 'b"""', "B'''", 'B"""',
"ur'''", 'ur"""', "Ur'''", 'Ur"""',
"uR'''", 'uR"""', "UR'''", 'UR"""',
"br'''", 'br"""', "Br'''", 'Br"""',
"bR'''", 'bR"""', "BR'''", 'BR"""',):
triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
"r'", 'r"', "R'", 'R"',
"u'", 'u"', "U'", 'U"',
"b'", 'b"', "B'", 'B"',
"ur'", 'ur"', "Ur'", 'Ur"',
"uR'", 'uR"', "UR'", 'UR"',
"br'", 'br"', "Br'", 'Br"',
"bR'", 'bR"', "BR'", 'BR"', ):
single_quoted[t] = t
tabsize = 8
class TokenError(Exception): pass
class StopTokenizing(Exception): pass
def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
(srow, scol) = xxx_todo_changeme
(erow, ecol) = xxx_todo_changeme1
print("%d,%d-%d,%d:\t%s\t%s" % \
(srow, scol, erow, ecol, tok_name[type], repr(token)))
def tokenize(readline, tokeneater=printtoken):
"""
The tokenize() function accepts two parameters: one representing the
input stream, and one providing an output mechanism for tokenize().
The first parameter, readline, must be a callable object which provides
the same interface as the readline() method of built-in file objects.
Each call to the function should return one line of input as a string.
The second parameter, tokeneater, must also be a callable object. It is
called once for each token, with five arguments, corresponding to the
tuples generated by generate_tokens().
"""
try:
tokenize_loop(readline, tokeneater)
except StopTokenizing:
pass
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
for token_info in generate_tokens(readline):
tokeneater(*token_info)
class Untokenizer:
def __init__(self):
self.tokens = []
self.prev_row = 1
self.prev_col = 0
def add_whitespace(self, start):
row, col = start
assert row <= self.prev_row
col_offset = col - self.prev_col
if col_offset:
self.tokens.append(" " * col_offset)
def untokenize(self, iterable):
for t in iterable:
if len(t) == 2:
self.compat(t, iterable)
break
tok_type, token, start, end, line = t
self.add_whitespace(start)
self.tokens.append(token)
self.prev_row, self.prev_col = end
if tok_type in (NEWLINE, NL):
self.prev_row += 1
self.prev_col = 0
return "".join(self.tokens)
def compat(self, token, iterable):
startline = False
indents = []
toks_append = self.tokens.append
toknum, tokval = token
if toknum in (NAME, NUMBER):
tokval += ' '
if toknum in (NEWLINE, NL):
startline = True
for tok in iterable:
toknum, tokval = tok[:2]
if toknum in (NAME, NUMBER):
tokval += ' '
if toknum == INDENT:
indents.append(tokval)
continue
elif toknum == DEDENT:
indents.pop()
continue
elif toknum in (NEWLINE, NL):
startline = True
elif startline and indents:
toks_append(indents[-1])
startline = False
toks_append(tokval)
cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
def _get_normal_name(orig_enc):
"""Imitates get_normal_name in tokenizer.c."""
# Only care about the first 12 characters.
enc = orig_enc[:12].lower().replace("_", "-")
if enc == "utf-8" or enc.startswith("utf-8-"):
return "utf-8"
if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
return "iso-8859-1"
return orig_enc
def detect_encoding(readline):
"""
The detect_encoding() function is used to detect the encoding that should
be used to decode a Python source file. It requires one argument, readline,
in the same way as the tokenize() generator.
It will call readline a maximum of twice, and return the encoding used
(as a string) and a list of any lines (left as bytes) it has read
in.
It detects the encoding from the presence of a utf-8 bom or an encoding
cookie as specified in pep-0263. If both a bom and a cookie are present, but
disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
charset, raise a SyntaxError. Note that if a utf-8 bom is found,
'utf-8-sig' is returned.
If no encoding is specified, then the default of 'utf-8' will be returned.
"""
bom_found = False
encoding = None
default = 'utf-8'
def read_or_stop():
try:
return readline()
except StopIteration:
return bytes()
def find_cookie(line):
try:
line_string = line.decode('ascii')
except UnicodeDecodeError:
return None
match = cookie_re.match(line_string)
if not match:
return None
encoding = _get_normal_name(match.group(1))
try:
codec = lookup(encoding)
except LookupError:
# This behaviour mimics the Python interpreter
raise SyntaxError("unknown encoding: " + encoding)
if bom_found:
if codec.name != 'utf-8':
# This behaviour mimics the Python interpreter
raise SyntaxError('encoding problem: utf-8')
encoding += '-sig'
return encoding
first = read_or_stop()
if first.startswith(BOM_UTF8):
bom_found = True
first = first[3:]
default = 'utf-8-sig'
if not first:
return default, []
encoding = find_cookie(first)
if encoding:
return encoding, [first]
if not blank_re.match(first):
return default, [first]
second = read_or_stop()
if not second:
return default, [first]
encoding = find_cookie(second)
if encoding:
return encoding, [first, second]
return default, [first, second]
def untokenize(iterable):
"""Transform tokens back into Python source code.
Each element returned by the iterable must be a token sequence
with at least two elements, a token number and token value. If
only two tokens are passed, the resulting output is poor.
Round-trip invariant for full input:
Untokenized source will match input source exactly
Round-trip invariant for limited input:
# Output text will tokenize back to the input
t1 = [tok[:2] for tok in generate_tokens(f.readline)]
newcode = untokenize(t1)
readline = iter(newcode.splitlines(1)).next
t2 = [tok[:2] for tok in generate_tokens(readline)]
assert t1 == t2
"""
ut = Untokenizer()
return ut.untokenize(iterable)
def generate_tokens(readline):
"""
The generate_tokens() generator requires one argument, readline, which
must be a callable object which provides the same interface as the
readline() method of built-in file objects. Each call to the function
should return one line of input as a string. Alternately, readline
can be a callable function terminating with StopIteration:
readline = open(myfile).next # Example of alternate readline
The generator produces 5-tuples with these members: the token type; the
token string; a 2-tuple (srow, scol) of ints specifying the row and
column where the token begins in the source; a 2-tuple (erow, ecol) of
ints specifying the row and column where the token ends in the source;
and the line on which the token was found. The line passed is the
logical line; continuation lines are included.
"""
lnum = parenlev = continued = 0
namechars, numchars = string.ascii_letters + '_', '0123456789'
contstr, needcont = '', 0
contline = None
indents = [0]
while 1: # loop over lines in stream
try:
line = readline()
except StopIteration:
line = ''
lnum = lnum + 1
pos, max = 0, len(line)
if contstr: # continued string
if not line:
raise TokenError("EOF in multi-line string", strstart)
endmatch = endprog.match(line)
if endmatch:
pos = end = endmatch.end(0)
yield (STRING, contstr + line[:end],
strstart, (lnum, end), contline + line)
contstr, needcont = '', 0
contline = None
elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
yield (ERRORTOKEN, contstr + line,
strstart, (lnum, len(line)), contline)
contstr = ''
contline = None
continue
else:
contstr = contstr + line
contline = contline + line
continue
elif parenlev == 0 and not continued: # new statement
if not line: break
column = 0
while pos < max: # measure leading whitespace
if line[pos] == ' ': column = column + 1
elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
elif line[pos] == '\f': column = 0
else: break
pos = pos + 1
if pos == max: break
if line[pos] in '#\r\n': # skip comments or blank lines
if line[pos] == '#':
comment_token = line[pos:].rstrip('\r\n')
nl_pos = pos + len(comment_token)
yield (COMMENT, comment_token,
(lnum, pos), (lnum, pos + len(comment_token)), line)
yield (NL, line[nl_pos:],
(lnum, nl_pos), (lnum, len(line)), line)
else:
yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
(lnum, pos), (lnum, len(line)), line)
continue
if column > indents[-1]: # count indents or dedents
indents.append(column)
yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
while column < indents[-1]:
if column not in indents:
raise IndentationError(
"unindent does not match any outer indentation level",
("<tokenize>", lnum, pos, line))
indents = indents[:-1]
yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
else: # continued statement
if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0))
continued = 0
while pos < max:
pseudomatch = pseudoprog.match(line, pos)
if pseudomatch: # scan for tokens
start, end = pseudomatch.span(1)
spos, epos, pos = (lnum, start), (lnum, end), end
token, initial = line[start:end], line[start]
if initial in numchars or \
(initial == '.' and token != '.'): # ordinary number
yield (NUMBER, token, spos, epos, line)
elif initial in '\r\n':
newline = NEWLINE
if parenlev > 0:
newline = NL
yield (newline, token, spos, epos, line)
elif initial == '#':
assert not token.endswith("\n")
yield (COMMENT, token, spos, epos, line)
elif token in triple_quoted:
endprog = endprogs[token]
endmatch = endprog.match(line, pos)
if endmatch: # all on one line
pos = endmatch.end(0)
token = line[start:pos]
yield (STRING, token, spos, (lnum, pos), line)
else:
strstart = (lnum, start) # multiple lines
contstr = line[start:]
contline = line
break
elif initial in single_quoted or \
token[:2] in single_quoted or \
token[:3] in single_quoted:
if token[-1] == '\n': # continued string
strstart = (lnum, start)
endprog = (endprogs[initial] or endprogs[token[1]] or
endprogs[token[2]])
contstr, needcont = line[start:], 1
contline = line
break
else: # ordinary string
yield (STRING, token, spos, epos, line)
elif initial in namechars: # ordinary name
yield (NAME, token, spos, epos, line)
elif initial == '\\': # continued stmt
# This yield is new; needed for better idempotency:
yield (NL, token, spos, (lnum, pos), line)
continued = 1
else:
if initial in '([{': parenlev = parenlev + 1
elif initial in ')]}': parenlev = parenlev - 1
yield (OP, token, spos, epos, line)
else:
yield (ERRORTOKEN, line[pos],
(lnum, pos), (lnum, pos+1), line)
pos = pos + 1
for indent in indents[1:]: # pop remaining indent levels
yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
if __name__ == '__main__': # testing
import sys
if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
else: tokenize(sys.stdin.readline)
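A sketch of the generate_tokens() interface documented above: give it a readline callable and consume 5-tuples. The module path is an assumption based on the relative imports in this commit.

import io
from jedi.parser.pgen2 import token, tokenize

source = "x = 1  # answer\n"
readline = io.StringIO(source).readline
for typ, value, start, end, line in tokenize.generate_tokens(readline):
    # e.g. NAME 'x' (1, 0) (1, 1), ..., COMMENT '# answer', NEWLINE, ENDMARKER
    print(token.tok_name[typ], repr(value), start, end)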

jedi/parser/pytree.py Normal file

@@ -0,0 +1,456 @@
# Copyright 2006 Google, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""
Python parse tree definitions.
This is a very concrete parse tree; we need to keep every token and
even the comments and whitespace between tokens.
There's also a pattern matching implementation here.
"""
__author__ = "Guido van Rossum <guido@python.org>"
import sys
import os
from . import pgen2
HUGE = 0x7FFFFFFF # maximum repeat count, default max
_type_reprs = {}
# The grammar file
_GRAMMAR_FILE = os.path.join(os.path.dirname(__file__), "grammar.txt")
class Symbols(object):
def __init__(self, grammar):
"""Initializer.
Creates an attribute for each grammar symbol (nonterminal),
whose value is the symbol's type (an int >= 256).
"""
for name, symbol in grammar.symbol2number.items():
setattr(self, name, symbol)
python_grammar = pgen2.load_grammar(_GRAMMAR_FILE)
python_symbols = Symbols(python_grammar)
python_grammar_no_print_statement = python_grammar.copy()
del python_grammar_no_print_statement.keywords["print"]
def type_repr(type_num):
global _type_reprs
if not _type_reprs:
# printing tokens is possible but not as useful
# from .pgen2 import token // token.__dict__.items():
for name, val in python_symbols.__dict__.items():
if type(val) == int:
_type_reprs[val] = name
return _type_reprs.setdefault(type_num, type_num)
class Base(object):
"""
Abstract base class for Node and Leaf.
This provides some default functionality and boilerplate using the
template pattern.
A node may be a subnode of at most one parent.
"""
# Default values for instance variables
type = None # int: token number (< 256) or symbol number (>= 256)
parent = None # Parent node pointer, or None
children = () # Tuple of subnodes
was_changed = False
was_checked = False
def __new__(cls, *args, **kwds):
"""Constructor that prevents Base from being instantiated."""
assert cls is not Base, "Cannot instantiate Base"
return object.__new__(cls)
def __eq__(self, other):
"""
Compare two nodes for equality.
This calls the method _eq().
"""
if self.__class__ is not other.__class__:
return NotImplemented
return self._eq(other)
__hash__ = None # For Py3 compatibility.
def __ne__(self, other):
"""
Compare two nodes for inequality.
This calls the method _eq().
"""
if self.__class__ is not other.__class__:
return NotImplemented
return not self._eq(other)
def _eq(self, other):
"""
Compare two nodes for equality.
This is called by __eq__ and __ne__. It is only called if the two nodes
have the same type. This must be implemented by the concrete subclass.
Nodes should be considered equal if they have the same structure,
ignoring the prefix string and other context information.
"""
raise NotImplementedError
def clone(self):
"""
Return a cloned (deep) copy of self.
This must be implemented by the concrete subclass.
"""
raise NotImplementedError
def post_order(self):
"""
Return a post-order iterator for the tree.
This must be implemented by the concrete subclass.
"""
raise NotImplementedError
def pre_order(self):
"""
Return a pre-order iterator for the tree.
This must be implemented by the concrete subclass.
"""
raise NotImplementedError
def replace(self, new):
"""Replace this node with a new one in the parent."""
assert self.parent is not None, str(self)
assert new is not None
if not isinstance(new, list):
new = [new]
l_children = []
found = False
for ch in self.parent.children:
if ch is self:
assert not found, (self.parent.children, self, new)
if new is not None:
l_children.extend(new)
found = True
else:
l_children.append(ch)
assert found, (self.children, self, new)
self.parent.changed()
self.parent.children = l_children
for x in new:
x.parent = self.parent
self.parent = None
def get_lineno(self):
"""Return the line number which generated the invocant node."""
node = self
while not isinstance(node, Leaf):
if not node.children:
return
node = node.children[0]
return node.lineno
def changed(self):
if self.parent:
self.parent.changed()
self.was_changed = True
def remove(self):
"""
Remove the node from the tree. Returns the position of the node in its
parent's children before it was removed.
"""
if self.parent:
for i, node in enumerate(self.parent.children):
if node is self:
self.parent.changed()
del self.parent.children[i]
self.parent = None
return i
@property
def next_sibling(self):
"""
The node immediately following the invocant in their parent's children
list. If the invocant does not have a next sibling, it is None
"""
if self.parent is None:
return None
# Can't use index(); we need to test by identity
for i, child in enumerate(self.parent.children):
if child is self:
try:
return self.parent.children[i + 1]
except IndexError:
return None
@property
def prev_sibling(self):
"""
The node immediately preceding the invocant in their parent's children
list. If the invocant does not have a previous sibling, it is None.
"""
if self.parent is None:
return None
# Can't use index(); we need to test by identity
for i, child in enumerate(self.parent.children):
if child is self:
if i == 0:
return None
return self.parent.children[i - 1]
def leaves(self):
for child in self.children:
for leave in child.leaves():
yield leave
def depth(self):
if self.parent is None:
return 0
return 1 + self.parent.depth()
def get_suffix(self):
"""
Return the string immediately following the invocant node. This is
effectively equivalent to node.next_sibling.prefix
"""
next_sib = self.next_sibling
if next_sib is None:
return ""
return next_sib.prefix
if sys.version_info < (3, 0):
def __str__(self):
# Go through __unicode__ here; str(self) would recurse on Python 2.
return unicode(self).encode("ascii")
class Node(Base):
"""Concrete implementation for interior nodes."""
def __init__(self, type, children,
context=None,
prefix=None,
fixers_applied=None):
"""
Initializer.
Takes a type constant (a symbol number >= 256), a sequence of
child nodes, and an optional context keyword argument.
As a side effect, the parent pointers of the children are updated.
"""
assert type >= 256, type
self.type = type
self.children = list(children)
for ch in self.children:
assert ch.parent is None, repr(ch)
ch.parent = self
if prefix is not None:
self.prefix = prefix
if fixers_applied:
self.fixers_applied = fixers_applied[:]
else:
self.fixers_applied = None
def __repr__(self):
"""Return a canonical string representation."""
return "%s(%s, %r)" % (self.__class__.__name__,
type_repr(self.type),
self.children)
def __unicode__(self):
"""
Return a pretty string representation.
This reproduces the input source exactly.
"""
return "".join(map(str, self.children))
if sys.version_info > (3, 0):
__str__ = __unicode__
def _eq(self, other):
"""Compare two nodes for equality."""
return (self.type, self.children) == (other.type, other.children)
def clone(self):
"""Return a cloned (deep) copy of self."""
return Node(self.type, [ch.clone() for ch in self.children],
fixers_applied=self.fixers_applied)
def post_order(self):
"""Return a post-order iterator for the tree."""
for child in self.children:
for el in child.post_order():
yield el
yield self
def pre_order(self):
"""Return a pre-order iterator for the tree."""
yield self
for child in self.children:
for el in child.post_order():
yield el
def _prefix_getter(self):
"""
The whitespace and comments preceding this node in the input.
"""
if not self.children:
return ""
return self.children[0].prefix
def _prefix_setter(self, prefix):
if self.children:
self.children[0].prefix = prefix
prefix = property(_prefix_getter, _prefix_setter)
def set_child(self, i, child):
"""
Equivalent to 'node.children[i] = child'. This method also sets the
child's parent attribute appropriately.
"""
child.parent = self
self.children[i].parent = None
self.children[i] = child
self.changed()
def insert_child(self, i, child):
"""
Equivalent to 'node.children.insert(i, child)'. This method also sets
the child's parent attribute appropriately.
"""
child.parent = self
self.children.insert(i, child)
self.changed()
def append_child(self, child):
"""
Equivalent to 'node.children.append(child)'. This method also sets the
child's parent attribute appropriately.
"""
child.parent = self
self.children.append(child)
self.changed()
class Leaf(Base):
"""Concrete implementation for leaf nodes."""
# Default values for instance variables
_prefix = "" # Whitespace and comments preceding this token in the input
lineno = 0 # Line where this token starts in the input
column = 0 # Column where this token starts in the input
def __init__(self, type, value,
context=None,
prefix=None,
fixers_applied=[]):
"""
Initializer.
Takes a type constant (a token number < 256), a string value, and an
optional context keyword argument.
"""
assert 0 <= type < 256, type
if context is not None:
self._prefix, (self.lineno, self.column) = context
self.type = type
self.value = value
if prefix is not None:
self._prefix = prefix
self.fixers_applied = fixers_applied[:]
def __repr__(self):
"""Return a canonical string representation."""
return "%s(%r, %r)" % (self.__class__.__name__,
self.type,
self.value)
def __unicode__(self):
"""
Return a pretty string representation.
This reproduces the input source exactly.
"""
return self.prefix + str(self.value)
if sys.version_info > (3, 0):
__str__ = __unicode__
def _eq(self, other):
"""Compare two nodes for equality."""
return (self.type, self.value) == (other.type, other.value)
def clone(self):
"""Return a cloned (deep) copy of self."""
return Leaf(self.type, self.value,
(self.prefix, (self.lineno, self.column)),
fixers_applied=self.fixers_applied)
def leaves(self):
yield self
def post_order(self):
"""Return a post-order iterator for the tree."""
yield self
def pre_order(self):
"""Return a pre-order iterator for the tree."""
yield self
def _prefix_getter(self):
"""
The whitespace and comments preceding this token in the input.
"""
return self._prefix
def _prefix_setter(self, prefix):
self.changed()
self._prefix = prefix
prefix = property(_prefix_getter, _prefix_setter)
def convert(gr, raw_node):
"""
Convert raw node information to a Node or Leaf instance.
This is passed to the parser driver which calls it whenever a reduction of a
grammar rule produces a new complete node, so that the tree is built
strictly bottom-up.
"""
#import pdb; pdb.set_trace()
print(raw_node)
type, value, context, children = raw_node
if children or type in gr.number2symbol:
# If there's exactly one child, return that child instead of
# creating a new node.
if len(children) == 1:
return children[0]
return Node(type, children, context=context)
else:
return Leaf(type, value, context=context)
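A sketch of building a tree by hand with the classes above, showing how prefixes preserve the original whitespace so that str() reproduces the source exactly (token numbers come from the pgen2 token module; paths assumed):

from jedi.parser import pytree
from jedi.parser.pgen2 import token

name = pytree.Leaf(token.NAME, "x")
eq = pytree.Leaf(token.EQUAL, "=", prefix=" ")
num = pytree.Leaf(token.NUMBER, "1", prefix=" ")
newline = pytree.Leaf(token.NEWLINE, "\n")
expr = pytree.Node(pytree.python_symbols.expr_stmt, [name, eq, num])
stmt = pytree.Node(pytree.python_symbols.simple_stmt, [expr, newline])
print(repr(str(stmt)))                           # 'x = 1\n', prefixes carry the spaces
print(stmt.children[0].next_sibling is newline)  # True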

test/test_new_parser.py Normal file

@@ -0,0 +1,27 @@
import logging
from jedi.parser import pytree
from jedi.parser.pgen2 import Driver
def test_basic():
#if self.options["print_function"]:
# python_grammar = pygram.python_grammar_no_print_statement
#else:
# When this is True, the refactor*() methods will call write_file() for
# files processed even if they were not changed during refactoring. If
# and only if the refactor method's write parameter was True.
logger = logging.getLogger("RefactoringTool")
d = Driver(pytree.python_grammar, convert=pytree.convert, logger=logger)
print(d)
tree = d.parse_string('wblabla* 1\t\n')
print(repr(tree))
print(tree)
#import pdb; pdb.set_trace()
print(repr(d.parse_string('def x(): pass\n')))
print()
x = d.parse_string('\na #pass\n')
print(repr(x))
print(x)
assert False