Add a new parser, check it pgen2 would work. (already modified outside this repository)

2025-12-08 06:44:46 +08:00 · 2014-10-08 14:26:52 +02:00
parent 09a7317bc9
commit 834172a3e9
9 changed files with 2099 additions and 0 deletions
--- a/jedi/parser/grammar.txt
+++ b/jedi/parser/grammar.txt
@@ -0,0 +1,158 @@
+# Grammar for 2to3. This grammar supports Python 2.x and 3.x.
+
+# Note:  Changing the grammar specified in this file will most likely
+#        require corresponding changes in the parser module
+#        (../Modules/parsermodule.c).  If you can't make the changes to
+#        that module yourself, please co-ordinate the required changes
+#        with someone who can; ask around on python-dev for help.  Fred
+#        Drake <fdrake@acm.org> will probably be listening there.
+
+# NOTE WELL: You should also follow all the steps listed in PEP 306,
+# "How to Change Python's Grammar"
+
+# Commands for Kees Blom's railroad program
+#diagram:token NAME
+#diagram:token NUMBER
+#diagram:token STRING
+#diagram:token NEWLINE
+#diagram:token ENDMARKER
+#diagram:token INDENT
+#diagram:output\input python.bla
+#diagram:token DEDENT
+#diagram:output\textwidth 20.04cm\oddsidemargin  0.0cm\evensidemargin 0.0cm
+#diagram:rules
+
+# Start symbols for the grammar:
+#	file_input is a module or sequence of commands read from an input file;
+#	single_input is a single interactive statement;
+#	eval_input is the input for the eval() and input() functions.
+# NB: compound_stmt in single_input is followed by extra NEWLINE!
+file_input: (NEWLINE | stmt)* ENDMARKER
+single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
+eval_input: testlist NEWLINE* ENDMARKER
+
+decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE
+decorators: decorator+
+decorated: decorators (classdef | funcdef)
+funcdef: 'def' NAME parameters ['->' test] ':' suite
+parameters: '(' [typedargslist] ')'
+typedargslist: ((tfpdef ['=' test] ',')*
+                ('*' [tname] (',' tname ['=' test])* [',' '**' tname] | '**' tname)
+                | tfpdef ['=' test] (',' tfpdef ['=' test])* [','])
+tname: NAME [':' test]
+tfpdef: tname | '(' tfplist ')'
+tfplist: tfpdef (',' tfpdef)* [',']
+varargslist: ((vfpdef ['=' test] ',')*
+              ('*' [vname] (',' vname ['=' test])*  [',' '**' vname] | '**' vname)
+              | vfpdef ['=' test] (',' vfpdef ['=' test])* [','])
+vname: NAME
+vfpdef: vname | '(' vfplist ')'
+vfplist: vfpdef (',' vfpdef)* [',']
+
+stmt: simple_stmt | compound_stmt
+simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
+small_stmt: (expr_stmt | print_stmt  | del_stmt | pass_stmt | flow_stmt |
+             import_stmt | global_stmt | exec_stmt | assert_stmt)
+expr_stmt: testlist_star_expr (augassign (yield_expr|testlist) |
+                     ('=' (yield_expr|testlist_star_expr))*)
+testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']
+augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |
+            '<<=' | '>>=' | '**=' | '//=')
+# For normal assignments, additional restrictions enforced by the interpreter
+print_stmt: 'print' ( [ test (',' test)* [','] ] |
+                      '>>' test [ (',' test)+ [','] ] )
+del_stmt: 'del' exprlist
+pass_stmt: 'pass'
+flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
+break_stmt: 'break'
+continue_stmt: 'continue'
+return_stmt: 'return' [testlist]
+yield_stmt: yield_expr
+raise_stmt: 'raise' [test ['from' test | ',' test [',' test]]]
+import_stmt: import_name | import_from
+import_name: 'import' dotted_as_names
+import_from: ('from' ('.'* dotted_name | '.'+)
+              'import' ('*' | '(' import_as_names ')' | import_as_names))
+import_as_name: NAME ['as' NAME]
+dotted_as_name: dotted_name ['as' NAME]
+import_as_names: import_as_name (',' import_as_name)* [',']
+dotted_as_names: dotted_as_name (',' dotted_as_name)*
+dotted_name: NAME ('.' NAME)*
+global_stmt: ('global' | 'nonlocal') NAME (',' NAME)*
+exec_stmt: 'exec' expr ['in' test [',' test]]
+assert_stmt: 'assert' test [',' test]
+
+compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated
+if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite]
+while_stmt: 'while' test ':' suite ['else' ':' suite]
+for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
+try_stmt: ('try' ':' suite
+           ((except_clause ':' suite)+
+	    ['else' ':' suite]
+	    ['finally' ':' suite] |
+	   'finally' ':' suite))
+with_stmt: 'with' with_item (',' with_item)*  ':' suite
+with_item: test ['as' expr]
+with_var: 'as' expr
+# NB compile.c makes sure that the default except clause is last
+except_clause: 'except' [test [(',' | 'as') test]]
+suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
+
+# Backward compatibility cruft to support:
+# [ x for x in lambda: True, lambda: False if x() ]
+# even while also allowing:
+# lambda x: 5 if x else 2
+# (But not a mix of the two)
+testlist_safe: old_test [(',' old_test)+ [',']]
+old_test: or_test | old_lambdef
+old_lambdef: 'lambda' [varargslist] ':' old_test
+
+test: or_test ['if' or_test 'else' test] | lambdef
+or_test: and_test ('or' and_test)*
+and_test: not_test ('and' not_test)*
+not_test: 'not' not_test | comparison
+comparison: expr (comp_op expr)*
+comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
+star_expr: '*' expr
+expr: xor_expr ('|' xor_expr)*
+xor_expr: and_expr ('^' and_expr)*
+and_expr: shift_expr ('&' shift_expr)*
+shift_expr: arith_expr (('<<'|'>>') arith_expr)*
+arith_expr: term (('+'|'-') term)*
+term: factor (('*'|'/'|'%'|'//') factor)*
+factor: ('+'|'-'|'~') factor | power
+power: atom trailer* ['**' factor]
+atom: ('(' [yield_expr|testlist_gexp] ')' |
+       '[' [listmaker] ']' |
+       '{' [dictsetmaker] '}' |
+       '`' testlist1 '`' |
+       NAME | NUMBER | STRING+ | '.' '.' '.')
+listmaker: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
+testlist_gexp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
+lambdef: 'lambda' [varargslist] ':' test
+trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
+subscriptlist: subscript (',' subscript)* [',']
+subscript: test | [test] ':' [test] [sliceop]
+sliceop: ':' [test]
+exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
+testlist: test (',' test)* [',']
+dictsetmaker: ( (test ':' test (comp_for | (',' test ':' test)* [','])) |
+                (test (comp_for | (',' test)* [','])) )
+
+classdef: 'class' NAME ['(' [arglist] ')'] ':' suite
+
+arglist: (argument ',')* (argument [',']
+                         |'*' test (',' argument)* [',' '**' test] 
+                         |'**' test)
+argument: test [comp_for] | test '=' test  # Really [keyword '='] test
+
+comp_iter: comp_for | comp_if
+comp_for: 'for' exprlist 'in' testlist_safe [comp_iter]
+comp_if: 'if' old_test [comp_iter]
+
+testlist1: test (',' test)*
+
+# not used in grammar, but may appear in "node" passed from Parser to Compiler
+encoding_decl: NAME
+
+yield_expr: 'yield' [testlist]
--- a/jedi/parser/pgen2/init.py
+++ b/jedi/parser/pgen2/init.py
@@ -0,0 +1,114 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+# Modifications:
+# Copyright 2006 Google, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+__all__ = ["Driver", "load_grammar"]
+
+
+import os
+import sys
+import logging
+import io
+
+from . import pgen
+from . import grammar
+from . import parse
+from . import token
+from . import tokenize
+
+
+class Driver(object):
+    def __init__(self, grammar, convert=None, logger=None):
+        self.grammar = grammar
+        if logger is None:
+            logger = logging.getLogger()
+        self.logger = logger
+        self.convert = convert
+
+    def parse_tokens(self, tokens):
+        """Parse a series of tokens and return the syntax tree."""
+        # XXX Move the prefix computation into a wrapper around tokenize.
+        p = parse.Parser(self.grammar, self.convert)
+        lineno = 1
+        column = 0
+        type = value = start = end = line_text = None
+        prefix = ""
+        for quintuple in tokens:
+            type, value, start, end, line_text = quintuple
+            if start != (lineno, column):
+                assert (lineno, column) <= start, ((lineno, column), start)
+                s_lineno, s_column = start
+                if lineno < s_lineno:
+                    prefix += "\n" * (s_lineno - lineno)
+                    lineno = s_lineno
+                    column = 0
+                if column < s_column:
+                    prefix += line_text[column:s_column]
+                    column = s_column
+            if type in (tokenize.COMMENT, tokenize.NL):  # NL != NEWLINE
+                prefix += value
+                lineno, column = end
+                if value.endswith("\n"):
+                    lineno += 1
+                    column = 0
+                continue
+            if type == token.OP:
+                type = grammar.opmap[value]
+            #self.logger.debug("%s %r (prefix=%r)", token.tok_name[type], value, prefix)
+            if p.addtoken(type, value, (prefix, start)):
+                break
+            prefix = ""
+            lineno, column = end
+            if value.endswith("\n"):
+                lineno += 1
+                column = 0
+        else:
+            # We never broke out -- EOF is too soon (how can this happen???)
+            raise parse.ParseError("incomplete input",
+                                   type, value, (prefix, start))
+        return p.rootnode
+
+    def parse_string(self, text):
+        """Parse a string and return the syntax tree."""
+        tokens = tokenize.generate_tokens(io.StringIO(text).readline)
+        return self.parse_tokens(tokens)
+
+
+def load_grammar(grammar_path="grammar.txt", pickle_path=None,
+                 save=True, force=False, logger=None):
+    """Load the grammar (maybe from a pickle)."""
+    if logger is None:
+        logger = logging.getLogger()
+    if pickle_path is None:
+        head, tail = os.path.splitext(grammar_path)
+        if tail == ".txt":
+            tail = ""
+        pickle_path = head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
+    if force or not _newer(pickle_path, grammar_path):
+        logger.info("Generating grammar tables from %s", grammar_path)
+        g = pgen.generate_grammar(grammar_path)
+        # the pickle files mismatch, when built on different architectures.
+        # don't save these for now. An alternative solution might be to
+        # include the multiarch triplet into the file name
+        if False:
+            logger.info("Writing grammar tables to %s", pickle_path)
+            try:
+                g.dump(pickle_path)
+            except OSError as e:
+                logger.info("Writing failed:" + str(e))
+    else:
+        g = grammar.Grammar()
+        g.load(pickle_path)
+    return g
+
+
+def _newer(a, b):
+    """Inquire whether file a was written since file b."""
+    if not os.path.exists(a):
+        return False
+    if not os.path.exists(b):
+        return True
+    return os.path.getmtime(a) >= os.path.getmtime(b)
--- a/jedi/parser/pgen2/grammar.py
+++ b/jedi/parser/pgen2/grammar.py
@@ -0,0 +1,182 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+"""This module defines the data structures used to represent a grammar.
+
+These are a bit arcane because they are derived from the data
+structures used by Python's 'pgen' parser generator.
+
+There's also a table here mapping operators to their names in the
+token module; the Python tokenize module reports all operators as the
+fallback token code OP, but the parser needs the actual token code.
+
+"""
+
+# Python imports
+import pickle
+
+# Local imports
+from . import token
+
+
+class Grammar(object):
+    """Pgen parsing tables conversion class.
+
+    Once initialized, this class supplies the grammar tables for the
+    parsing engine implemented by parse.py.  The parsing engine
+    accesses the instance variables directly.  The class here does not
+    provide initialization of the tables; several subclasses exist to
+    do this (see the conv and pgen modules).
+
+    The load() method reads the tables from a pickle file, which is
+    much faster than the other ways offered by subclasses.  The pickle
+    file is written by calling dump() (after loading the grammar
+    tables using a subclass).  The report() method prints a readable
+    representation of the tables to stdout, for debugging.
+
+    The instance variables are as follows:
+
+    symbol2number -- a dict mapping symbol names to numbers.  Symbol
+                     numbers are always 256 or higher, to distinguish
+                     them from token numbers, which are between 0 and
+                     255 (inclusive).
+
+    number2symbol -- a dict mapping numbers to symbol names;
+                     these two are each other's inverse.
+
+    states        -- a list of DFAs, where each DFA is a list of
+                     states, each state is a list of arcs, and each
+                     arc is a (i, j) pair where i is a label and j is
+                     a state number.  The DFA number is the index into
+                     this list.  (This name is slightly confusing.)
+                     Final states are represented by a special arc of
+                     the form (0, j) where j is its own state number.
+
+    dfas          -- a dict mapping symbol numbers to (DFA, first)
+                     pairs, where DFA is an item from the states list
+                     above, and first is a set of tokens that can
+                     begin this grammar rule (represented by a dict
+                     whose values are always 1).
+
+    labels        -- a list of (x, y) pairs where x is either a token
+                     number or a symbol number, and y is either None
+                     or a string; the strings are keywords.  The label
+                     number is the index in this list; label numbers
+                     are used to mark state transitions (arcs) in the
+                     DFAs.
+
+    start         -- the number of the grammar's start symbol.
+
+    keywords      -- a dict mapping keyword strings to arc labels.
+
+    tokens        -- a dict mapping token numbers to arc labels.
+
+    """
+
+    def __init__(self):
+        self.symbol2number = {}
+        self.number2symbol = {}
+        self.states = []
+        self.dfas = {}
+        self.labels = [(0, "EMPTY")]
+        self.keywords = {}
+        self.tokens = {}
+        self.symbol2label = {}
+        self.start = 256
+
+    def dump(self, filename):
+        """Dump the grammar tables to a pickle file."""
+        with open(filename, "wb") as f:
+            pickle.dump(self.__dict__, f, 2)
+
+    def load(self, filename):
+        """Load the grammar tables from a pickle file."""
+        with open(filename, "rb") as f:
+            d = pickle.load(f)
+        self.__dict__.update(d)
+
+    def copy(self):
+        """
+        Copy the grammar.
+        """
+        new = self.__class__()
+        for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
+                          "tokens", "symbol2label"):
+            setattr(new, dict_attr, getattr(self, dict_attr).copy())
+        new.labels = self.labels[:]
+        new.states = self.states[:]
+        new.start = self.start
+        return new
+
+    def report(self):
+        """Dump the grammar tables to standard output, for debugging."""
+        from pprint import pprint
+        print("s2n")
+        pprint(self.symbol2number)
+        print("n2s")
+        pprint(self.number2symbol)
+        print("states")
+        pprint(self.states)
+        print("dfas")
+        pprint(self.dfas)
+        print("labels")
+        pprint(self.labels)
+        print("start", self.start)
+
+
+# Map from operator to number (since tokenize doesn't do this)
+
+opmap_raw = """
+( LPAR
+) RPAR
+[ LSQB
+] RSQB
+: COLON
+, COMMA
+; SEMI
+ PLUS
+- MINUS
+* STAR
+/ SLASH
+| VBAR
+& AMPER
+< LESS
+> GREATER
+= EQUAL
+. DOT
+% PERCENT
+` BACKQUOTE
+{ LBRACE
+} RBRACE
+@ AT
+== EQEQUAL
+!= NOTEQUAL
+<> NOTEQUAL
+<= LESSEQUAL
+>= GREATEREQUAL
+~ TILDE
+^ CIRCUMFLEX
+<< LEFTSHIFT
+>> RIGHTSHIFT
+** DOUBLESTAR
+= PLUSEQUAL
+-= MINEQUAL
+*= STAREQUAL
+/= SLASHEQUAL
+%= PERCENTEQUAL
+&= AMPEREQUAL
+|= VBAREQUAL
+^= CIRCUMFLEXEQUAL
+<<= LEFTSHIFTEQUAL
+>>= RIGHTSHIFTEQUAL
+**= DOUBLESTAREQUAL
+// DOUBLESLASH
+//= DOUBLESLASHEQUAL
+-> RARROW
+"""
+
+opmap = {}
+for line in opmap_raw.splitlines():
+    if line:
+        op, name = line.split()
+        opmap[op] = getattr(token, name)
--- a/jedi/parser/pgen2/parse.py
+++ b/jedi/parser/pgen2/parse.py
@@ -0,0 +1,190 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+"""
+Parser engine for the grammar tables generated by pgen.
+
+The grammar table must be loaded first.
+
+See Parser/parser.c in the Python distribution for additional info on
+how this parsing engine works.
+"""
+
+# Local imports
+from . import token
+
+
+class ParseError(Exception):
+    """Exception to signal the parser is stuck."""
+
+    def __init__(self, msg, type, value, context):
+        Exception.__init__(self, "%s: type=%r, value=%r, context=%r" %
+                           (msg, type, value, context))
+        self.msg = msg
+        self.type = type
+        self.value = value
+        self.context = context
+
+
+class Parser(object):
+    """Parser engine.
+
+    The proper usage sequence is:
+
+    p = Parser(grammar, [converter])  # create instance
+    p.setup([start])                  # prepare for parsing
+    <for each input token>:
+        if p.addtoken(...):           # parse a token; may raise ParseError
+            break
+    root = p.rootnode                 # root of abstract syntax tree
+
+    A Parser instance may be reused by calling setup() repeatedly.
+
+    A Parser instance contains state pertaining to the current token
+    sequence, and should not be used concurrently by different threads
+    to parse separate token sequences.
+
+    See driver.py for how to get input tokens by tokenizing a file or
+    string.
+
+    Parsing is complete when addtoken() returns True; the root of the
+    abstract syntax tree can then be retrieved from the rootnode
+    instance variable.  When a syntax error occurs, addtoken() raises
+    the ParseError exception.  There is no error recovery; the parser
+    cannot be used after a syntax error was reported (but it can be
+    reinitialized by calling setup()).
+
+    """
+
+    def __init__(self, grammar, convert=None):
+        """Constructor.
+
+        The grammar argument is a grammar.Grammar instance; see the
+        grammar module for more information.
+
+        The parser is not ready yet for parsing; you must call the
+        setup() method to get it started.
+
+        The optional convert argument is a function mapping concrete
+        syntax tree nodes to abstract syntax tree nodes.  If not
+        given, no conversion is done and the syntax tree produced is
+        the concrete syntax tree.  If given, it must be a function of
+        two arguments, the first being the grammar (a grammar.Grammar
+        instance), and the second being the concrete syntax tree node
+        to be converted.  The syntax tree is converted from the bottom
+        up.
+
+        A concrete syntax tree node is a (type, value, context, nodes)
+        tuple, where type is the node type (a token or symbol number),
+        value is None for symbols and a string for tokens, context is
+        None or an opaque value used for error reporting (typically a
+        (lineno, offset) pair), and nodes is a list of children for
+        symbols, and None for tokens.
+
+        An abstract syntax tree node may be anything; this is entirely
+        up to the converter function.
+
+        """
+        self.grammar = grammar
+        self.convert = convert or (lambda grammar, node: node)
+
+        # Prepare for parsing.
+        start = self.grammar.start
+        # Each stack entry is a tuple: (dfa, state, node).
+        # A node is a tuple: (type, value, context, children),
+        # where children is a list of nodes or None, and context may be None.
+        newnode = (start, None, None, [])
+        stackentry = (self.grammar.dfas[start], 0, newnode)
+        self.stack = [stackentry]
+        self.rootnode = None
+        self.used_names = set()  # Aliased to self.rootnode.used_names in pop()
+
+    def addtoken(self, type, value, context):
+        """Add a token; return True iff this is the end of the program."""
+        # Map from token to label
+        ilabel = self.classify(type, value, context)
+        # Loop until the token is shifted; may raise exceptions
+        while True:
+            dfa, state, node = self.stack[-1]
+            states, first = dfa
+            arcs = states[state]
+            # Look for a state with this label
+            for i, newstate in arcs:
+                t, v = self.grammar.labels[i]
+                if ilabel == i:
+                    # Look it up in the list of labels
+                    assert t < 256
+                    # Shift a token; we're done with it
+                    self.shift(type, value, newstate, context)
+                    # Pop while we are in an accept-only state
+                    state = newstate
+                    while states[state] == [(0, state)]:
+                        self.pop()
+                        if not self.stack:
+                            # Done parsing!
+                            return True
+                        dfa, state, node = self.stack[-1]
+                        states, first = dfa
+                    # Done with this token
+                    return False
+                elif t >= 256:
+                    # See if it's a symbol and if we're in its first set
+                    itsdfa = self.grammar.dfas[t]
+                    itsstates, itsfirst = itsdfa
+                    if ilabel in itsfirst:
+                        # Push a symbol
+                        self.push(t, self.grammar.dfas[t], newstate, context)
+                        break  # To continue the outer while loop
+            else:
+                if (0, state) in arcs:
+                    # An accepting state, pop it and try something else
+                    self.pop()
+                    if not self.stack:
+                        # Done parsing, but another token is input
+                        raise ParseError("too much input",
+                                         type, value, context)
+                else:
+                    # No success finding a transition
+                    raise ParseError("bad input", type, value, context)
+
+    def classify(self, type, value, context):
+        """Turn a token into a label.  (Internal)"""
+        if type == token.NAME:
+            # Keep a listing of all used names
+            self.used_names.add(value)
+            # Check for reserved words
+            ilabel = self.grammar.keywords.get(value)
+            if ilabel is not None:
+                return ilabel
+        ilabel = self.grammar.tokens.get(type)
+        if ilabel is None:
+            raise ParseError("bad token", type, value, context)
+        return ilabel
+
+    def shift(self, type, value, newstate, context):
+        """Shift a token.  (Internal)"""
+        dfa, state, node = self.stack[-1]
+        newnode = (type, value, context, None)
+        newnode = self.convert(self.grammar, newnode)
+        if newnode is not None:
+            node[-1].append(newnode)
+        self.stack[-1] = (dfa, newstate, node)
+
+    def push(self, type, newdfa, newstate, context):
+        """Push a nonterminal.  (Internal)"""
+        dfa, state, node = self.stack[-1]
+        newnode = (type, None, context, [])
+        self.stack[-1] = (dfa, newstate, node)
+        self.stack.append((newdfa, 0, newnode))
+
+    def pop(self):
+        """Pop a nonterminal.  (Internal)"""
+        popdfa, popstate, popnode = self.stack.pop()
+        newnode = self.convert(self.grammar, popnode)
+        if newnode is not None:
+            if self.stack:
+                dfa, state, node = self.stack[-1]
+                node[-1].append(newnode)
+            else:
+                self.rootnode = newnode
+                self.rootnode.used_names = self.used_names
--- a/jedi/parser/pgen2/pgen.py
+++ b/jedi/parser/pgen2/pgen.py
@@ -0,0 +1,387 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+# Pgen imports
+from . import grammar, token, tokenize
+
+
+class ParserGenerator(object):
+    def __init__(self, filename, stream=None):
+        close_stream = None
+        if stream is None:
+            stream = open(filename)
+            close_stream = stream.close
+        self.filename = filename
+        self.stream = stream
+        self.generator = tokenize.generate_tokens(stream.readline)
+        self.gettoken()  # Initialize lookahead
+        self.dfas, self.startsymbol = self.parse()
+        if close_stream is not None:
+            close_stream()
+        self.first = {}  # map from symbol name to set of tokens
+        self.addfirstsets()
+
+    def make_grammar(self):
+        c = grammar.Grammar()
+        names = list(self.dfas.keys())
+        names.sort()
+        names.remove(self.startsymbol)
+        names.insert(0, self.startsymbol)
+        for name in names:
+            i = 256 + len(c.symbol2number)
+            c.symbol2number[name] = i
+            c.number2symbol[i] = name
+        for name in names:
+            dfa = self.dfas[name]
+            states = []
+            for state in dfa:
+                arcs = []
+                for label, next in state.arcs.items():
+                    arcs.append((self.make_label(c, label), dfa.index(next)))
+                if state.isfinal:
+                    arcs.append((0, dfa.index(state)))
+                states.append(arcs)
+            c.states.append(states)
+            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
+        c.start = c.symbol2number[self.startsymbol]
+        return c
+
+    def make_first(self, c, name):
+        rawfirst = self.first[name]
+        first = {}
+        for label in rawfirst:
+            ilabel = self.make_label(c, label)
+            ##assert ilabel not in first # XXX failed on <> ... !=
+            first[ilabel] = 1
+        return first
+
+    def make_label(self, c, label):
+        # XXX Maybe this should be a method on a subclass of converter?
+        ilabel = len(c.labels)
+        if label[0].isalpha():
+            # Either a symbol name or a named token
+            if label in c.symbol2number:
+                # A symbol name (a non-terminal)
+                if label in c.symbol2label:
+                    return c.symbol2label[label]
+                else:
+                    c.labels.append((c.symbol2number[label], None))
+                    c.symbol2label[label] = ilabel
+                    return ilabel
+            else:
+                # A named token (NAME, NUMBER, STRING)
+                itoken = getattr(token, label, None)
+                assert isinstance(itoken, int), label
+                assert itoken in token.tok_name, label
+                if itoken in c.tokens:
+                    return c.tokens[itoken]
+                else:
+                    c.labels.append((itoken, None))
+                    c.tokens[itoken] = ilabel
+                    return ilabel
+        else:
+            # Either a keyword or an operator
+            assert label[0] in ('"', "'"), label
+            value = eval(label)
+            if value[0].isalpha():
+                # A keyword
+                if value in c.keywords:
+                    return c.keywords[value]
+                else:
+                    c.labels.append((token.NAME, value))
+                    c.keywords[value] = ilabel
+                    return ilabel
+            else:
+                # An operator (any non-numeric token)
+                itoken = grammar.opmap[value]  # Fails if unknown token
+                if itoken in c.tokens:
+                    return c.tokens[itoken]
+                else:
+                    c.labels.append((itoken, None))
+                    c.tokens[itoken] = ilabel
+                    return ilabel
+
+    def addfirstsets(self):
+        names = list(self.dfas.keys())
+        names.sort()
+        for name in names:
+            if name not in self.first:
+                self.calcfirst(name)
+            #print name, self.first[name].keys()
+
+    def calcfirst(self, name):
+        dfa = self.dfas[name]
+        self.first[name] = None  # dummy to detect left recursion
+        state = dfa[0]
+        totalset = {}
+        overlapcheck = {}
+        for label, next in state.arcs.items():
+            if label in self.dfas:
+                if label in self.first:
+                    fset = self.first[label]
+                    if fset is None:
+                        raise ValueError("recursion for rule %r" % name)
+                else:
+                    self.calcfirst(label)
+                    fset = self.first[label]
+                totalset.update(fset)
+                overlapcheck[label] = fset
+            else:
+                totalset[label] = 1
+                overlapcheck[label] = {label: 1}
+        inverse = {}
+        for label, itsfirst in overlapcheck.items():
+            for symbol in itsfirst:
+                if symbol in inverse:
+                    raise ValueError("rule %s is ambiguous; %s is in the"
+                                     " first sets of %s as well as %s" %
+                                     (name, symbol, label, inverse[symbol]))
+                inverse[symbol] = label
+        self.first[name] = totalset
+
+    def parse(self):
+        dfas = {}
+        startsymbol = None
+        # MSTART: (NEWLINE | RULE)* ENDMARKER
+        while self.type != token.ENDMARKER:
+            while self.type == token.NEWLINE:
+                self.gettoken()
+            # RULE: NAME ':' RHS NEWLINE
+            name = self.expect(token.NAME)
+            self.expect(token.OP, ":")
+            a, z = self.parse_rhs()
+            self.expect(token.NEWLINE)
+            #self.dump_nfa(name, a, z)
+            dfa = self.make_dfa(a, z)
+            #self.dump_dfa(name, dfa)
+            # oldlen = len(dfa)
+            self.simplify_dfa(dfa)
+            # newlen = len(dfa)
+            dfas[name] = dfa
+            #print name, oldlen, newlen
+            if startsymbol is None:
+                startsymbol = name
+        return dfas, startsymbol
+
+    def make_dfa(self, start, finish):
+        # To turn an NFA into a DFA, we define the states of the DFA
+        # to correspond to *sets* of states of the NFA.  Then do some
+        # state reduction.  Let's represent sets as dicts with 1 for
+        # values.
+        assert isinstance(start, NFAState)
+        assert isinstance(finish, NFAState)
+
+        def closure(state):
+            base = {}
+            addclosure(state, base)
+            return base
+
+        def addclosure(state, base):
+            assert isinstance(state, NFAState)
+            if state in base:
+                return
+            base[state] = 1
+            for label, next in state.arcs:
+                if label is None:
+                    addclosure(next, base)
+
+        states = [DFAState(closure(start), finish)]
+        for state in states:  # NB states grows while we're iterating
+            arcs = {}
+            for nfastate in state.nfaset:
+                for label, next in nfastate.arcs:
+                    if label is not None:
+                        addclosure(next, arcs.setdefault(label, {}))
+            for label, nfaset in arcs.items():
+                for st in states:
+                    if st.nfaset == nfaset:
+                        break
+                else:
+                    st = DFAState(nfaset, finish)
+                    states.append(st)
+                state.addarc(st, label)
+        return states  # List of DFAState instances; first one is start
+
+    def dump_nfa(self, name, start, finish):
+        print("Dump of NFA for", name)
+        todo = [start]
+        for i, state in enumerate(todo):
+            print("  State", i, state is finish and "(final)" or "")
+            for label, next in state.arcs:
+                if next in todo:
+                    j = todo.index(next)
+                else:
+                    j = len(todo)
+                    todo.append(next)
+                if label is None:
+                    print("    -> %d" % j)
+                else:
+                    print("    %s -> %d" % (label, j))
+
+    def dump_dfa(self, name, dfa):
+        print("Dump of DFA for", name)
+        for i, state in enumerate(dfa):
+            print("  State", i, state.isfinal and "(final)" or "")
+            for label, next in state.arcs.items():
+                print("    %s -> %d" % (label, dfa.index(next)))
+
+    def simplify_dfa(self, dfa):
+        # This is not theoretically optimal, but works well enough.
+        # Algorithm: repeatedly look for two states that have the same
+        # set of arcs (same labels pointing to the same nodes) and
+        # unify them, until things stop changing.
+
+        # dfa is a list of DFAState instances
+        changes = True
+        while changes:
+            changes = False
+            for i, state_i in enumerate(dfa):
+                for j in range(i + 1, len(dfa)):
+                    state_j = dfa[j]
+                    if state_i == state_j:
+                        #print "  unify", i, j
+                        del dfa[j]
+                        for state in dfa:
+                            state.unifystate(state_j, state_i)
+                        changes = True
+                        break
+
+    def parse_rhs(self):
+        # RHS: ALT ('|' ALT)*
+        a, z = self.parse_alt()
+        if self.value != "|":
+            return a, z
+        else:
+            aa = NFAState()
+            zz = NFAState()
+            aa.addarc(a)
+            z.addarc(zz)
+            while self.value == "|":
+                self.gettoken()
+                a, z = self.parse_alt()
+                aa.addarc(a)
+                z.addarc(zz)
+            return aa, zz
+
+    def parse_alt(self):
+        # ALT: ITEM+
+        a, b = self.parse_item()
+        while (self.value in ("(", "[") or
+               self.type in (token.NAME, token.STRING)):
+            c, d = self.parse_item()
+            b.addarc(c)
+            b = d
+        return a, b
+
+    def parse_item(self):
+        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
+        if self.value == "[":
+            self.gettoken()
+            a, z = self.parse_rhs()
+            self.expect(token.OP, "]")
+            a.addarc(z)
+            return a, z
+        else:
+            a, z = self.parse_atom()
+            value = self.value
+            if value not in ("+", "*"):
+                return a, z
+            self.gettoken()
+            z.addarc(a)
+            if value == "+":
+                return a, z
+            else:
+                return a, a
+
+    def parse_atom(self):
+        # ATOM: '(' RHS ')' | NAME | STRING
+        if self.value == "(":
+            self.gettoken()
+            a, z = self.parse_rhs()
+            self.expect(token.OP, ")")
+            return a, z
+        elif self.type in (token.NAME, token.STRING):
+            a = NFAState()
+            z = NFAState()
+            a.addarc(z, self.value)
+            self.gettoken()
+            return a, z
+        else:
+            self.raise_error("expected (...) or NAME or STRING, got %s/%s",
+                             self.type, self.value)
+
+    def expect(self, type, value=None):
+        if self.type != type or (value is not None and self.value != value):
+            self.raise_error("expected %s/%s, got %s/%s",
+                             type, value, self.type, self.value)
+        value = self.value
+        self.gettoken()
+        return value
+
+    def gettoken(self):
+        tup = next(self.generator)
+        while tup[0] in (tokenize.COMMENT, tokenize.NL):
+            tup = next(self.generator)
+        self.type, self.value, self.begin, self.end, self.line = tup
+        #print token.tok_name[self.type], repr(self.value)
+
+    def raise_error(self, msg, *args):
+        if args:
+            try:
+                msg = msg % args
+            except:
+                msg = " ".join([msg] + list(map(str, args)))
+        raise SyntaxError(msg, (self.filename, self.end[0],
+                                self.end[1], self.line))
+
+
+class NFAState(object):
+    def __init__(self):
+        self.arcs = []  # list of (label, NFAState) pairs
+
+    def addarc(self, next, label=None):
+        assert label is None or isinstance(label, str)
+        assert isinstance(next, NFAState)
+        self.arcs.append((label, next))
+
+
+class DFAState(object):
+    def __init__(self, nfaset, final):
+        assert isinstance(nfaset, dict)
+        assert isinstance(next(iter(nfaset)), NFAState)
+        assert isinstance(final, NFAState)
+        self.nfaset = nfaset
+        self.isfinal = final in nfaset
+        self.arcs = {}  # map from label to DFAState
+
+    def addarc(self, next, label):
+        assert isinstance(label, str)
+        assert label not in self.arcs
+        assert isinstance(next, DFAState)
+        self.arcs[label] = next
+
+    def unifystate(self, old, new):
+        for label, next in self.arcs.items():
+            if next is old:
+                self.arcs[label] = new
+
+    def __eq__(self, other):
+        # Equality test -- ignore the nfaset instance variable
+        assert isinstance(other, DFAState)
+        if self.isfinal != other.isfinal:
+            return False
+        # Can't just return self.arcs == other.arcs, because that
+        # would invoke this method recursively, with cycles...
+        if len(self.arcs) != len(other.arcs):
+            return False
+        for label, next in self.arcs.items():
+            if next is not other.arcs.get(label):
+                return False
+        return True
+
+    __hash__ = None  # For Py3 compatibility.
+
+
+def generate_grammar(filename="Grammar.txt"):
+    p = ParserGenerator(filename)
+    return p.make_grammar()
--- a/jedi/parser/pgen2/token.py
+++ b/jedi/parser/pgen2/token.py
@@ -0,0 +1,82 @@
+#! /usr/bin/python3.4
+
+"""Token constants (from "token.h")."""
+
+#  Taken from Python (r53757) and modified to include some tokens
+#   originally monkeypatched in by pgen2.tokenize
+
+#--start constants--
+ENDMARKER = 0
+NAME = 1
+NUMBER = 2
+STRING = 3
+NEWLINE = 4
+INDENT = 5
+DEDENT = 6
+LPAR = 7
+RPAR = 8
+LSQB = 9
+RSQB = 10
+COLON = 11
+COMMA = 12
+SEMI = 13
+PLUS = 14
+MINUS = 15
+STAR = 16
+SLASH = 17
+VBAR = 18
+AMPER = 19
+LESS = 20
+GREATER = 21
+EQUAL = 22
+DOT = 23
+PERCENT = 24
+BACKQUOTE = 25
+LBRACE = 26
+RBRACE = 27
+EQEQUAL = 28
+NOTEQUAL = 29
+LESSEQUAL = 30
+GREATEREQUAL = 31
+TILDE = 32
+CIRCUMFLEX = 33
+LEFTSHIFT = 34
+RIGHTSHIFT = 35
+DOUBLESTAR = 36
+PLUSEQUAL = 37
+MINEQUAL = 38
+STAREQUAL = 39
+SLASHEQUAL = 40
+PERCENTEQUAL = 41
+AMPEREQUAL = 42
+VBAREQUAL = 43
+CIRCUMFLEXEQUAL = 44
+LEFTSHIFTEQUAL = 45
+RIGHTSHIFTEQUAL = 46
+DOUBLESTAREQUAL = 47
+DOUBLESLASH = 48
+DOUBLESLASHEQUAL = 49
+AT = 50
+OP = 51
+COMMENT = 52
+NL = 53
+RARROW = 54
+ERRORTOKEN = 55
+N_TOKENS = 56
+NT_OFFSET = 256
+#--end constants--
+
+tok_name = {}
+for _name, _value in list(globals().items()):
+    if type(_value) is type(0):
+        tok_name[_value] = _name
+
+
+def ISTERMINAL(x):
+    return x < NT_OFFSET
+
+def ISNONTERMINAL(x):
+    return x >= NT_OFFSET
+
+def ISEOF(x):
+    return x == ENDMARKER
--- a/jedi/parser/pgen2/tokenize.py
+++ b/jedi/parser/pgen2/tokenize.py
@@ -0,0 +1,503 @@
+# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
+# All rights reserved.
+
+"""Tokenization help for Python programs.
+
+generate_tokens(readline) is a generator that breaks a stream of
+text into Python tokens.  It accepts a readline-like method which is called
+repeatedly to get the next line of input (or "" for EOF).  It generates
+5-tuples with these members:
+
+    the token type (see token.py)
+    the token (a string)
+    the starting (row, column) indices of the token (a 2-tuple of ints)
+    the ending (row, column) indices of the token (a 2-tuple of ints)
+    the original line (string)
+
+It is designed to match the working of the Python tokenizer exactly, except
+that it produces COMMENT tokens for comments and gives type OP for all
+operators
+
+Older entry points
+    tokenize_loop(readline, tokeneater)
+    tokenize(readline, tokeneater=printtoken)
+are the same, except instead of generating tokens, tokeneater is a callback
+function to which the 5 fields described above are passed as 5 arguments,
+each time a new token is found."""
+
+__author__ = 'Ka-Ping Yee <ping@lfw.org>'
+__credits__ = \
+    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
+
+import string, re
+from codecs import BOM_UTF8, lookup
+
+from .token import *
+from . import token
+
+__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
+           "generate_tokens", "untokenize"]
+del token
+
+try:
+    bytes
+except NameError:
+    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
+    # valid Python 3 code.
+    bytes = str
+
+def group(*choices): return '(' + '|'.join(choices) + ')'
+def any(*choices): return group(*choices) + '*'
+def maybe(*choices): return group(*choices) + '?'
+
+Whitespace = r'[ \f\t]*'
+Comment = r'#[^\r\n]*'
+Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
+Name = r'[a-zA-Z_]\w*'
+
+Binnumber = r'0[bB][01]*'
+Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
+Octnumber = r'0[oO]?[0-7]*[lL]?'
+Decnumber = r'[1-9]\d*[lL]?'
+Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
+Exponent = r'[eE][-+]?\d+'
+Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
+Expfloat = r'\d+' + Exponent
+Floatnumber = group(Pointfloat, Expfloat)
+Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
+Number = group(Imagnumber, Floatnumber, Intnumber)
+
+# Tail end of ' string.
+Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
+# Tail end of " string.
+Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
+# Tail end of ''' string.
+Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
+# Tail end of """ string.
+Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
+Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
+# Single-line ' or " string.
+String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
+               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
+
+# Because of leftmost-then-longest match semantics, be sure to put the
+# longest operators first (e.g., if = came before ==, == would get
+# recognized as two instances of =).
+Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
+                 r"//=?", r"->",
+                 r"[+\-*/%&|^=<>]=?",
+                 r"~")
+
+Bracket = '[][(){}]'
+Special = group(r'\r?\n', r'[:;.,`@]')
+Funny = group(Operator, Bracket, Special)
+
+PlainToken = group(Number, Funny, String, Name)
+Token = Ignore + PlainToken
+
+# First (or only) line of ' or " string.
+ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
+                group("'", r'\\\r?\n'),
+                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
+                group('"', r'\\\r?\n'))
+PseudoExtras = group(r'\\\r?\n', Comment, Triple)
+PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
+
+tokenprog, pseudoprog, single3prog, double3prog = list(map(
+    re.compile, (Token, PseudoToken, Single3, Double3)))
+endprogs = {"'": re.compile(Single), '"': re.compile(Double),
+            "'''": single3prog, '"""': double3prog,
+            "r'''": single3prog, 'r"""': double3prog,
+            "u'''": single3prog, 'u"""': double3prog,
+            "b'''": single3prog, 'b"""': double3prog,
+            "ur'''": single3prog, 'ur"""': double3prog,
+            "br'''": single3prog, 'br"""': double3prog,
+            "R'''": single3prog, 'R"""': double3prog,
+            "U'''": single3prog, 'U"""': double3prog,
+            "B'''": single3prog, 'B"""': double3prog,
+            "uR'''": single3prog, 'uR"""': double3prog,
+            "Ur'''": single3prog, 'Ur"""': double3prog,
+            "UR'''": single3prog, 'UR"""': double3prog,
+            "bR'''": single3prog, 'bR"""': double3prog,
+            "Br'''": single3prog, 'Br"""': double3prog,
+            "BR'''": single3prog, 'BR"""': double3prog,
+            'r': None, 'R': None,
+            'u': None, 'U': None,
+            'b': None, 'B': None}
+
+triple_quoted = {}
+for t in ("'''", '"""',
+          "r'''", 'r"""', "R'''", 'R"""',
+          "u'''", 'u"""', "U'''", 'U"""',
+          "b'''", 'b"""', "B'''", 'B"""',
+          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
+          "uR'''", 'uR"""', "UR'''", 'UR"""',
+          "br'''", 'br"""', "Br'''", 'Br"""',
+          "bR'''", 'bR"""', "BR'''", 'BR"""',):
+    triple_quoted[t] = t
+single_quoted = {}
+for t in ("'", '"',
+          "r'", 'r"', "R'", 'R"',
+          "u'", 'u"', "U'", 'U"',
+          "b'", 'b"', "B'", 'B"',
+          "ur'", 'ur"', "Ur'", 'Ur"',
+          "uR'", 'uR"', "UR'", 'UR"',
+          "br'", 'br"', "Br'", 'Br"',
+          "bR'", 'bR"', "BR'", 'BR"', ):
+    single_quoted[t] = t
+
+tabsize = 8
+
+class TokenError(Exception): pass
+
+class StopTokenizing(Exception): pass
+
+def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
+    (srow, scol) = xxx_todo_changeme
+    (erow, ecol) = xxx_todo_changeme1
+    print("%d,%d-%d,%d:\t%s\t%s" % \
+        (srow, scol, erow, ecol, tok_name[type], repr(token)))
+
+def tokenize(readline, tokeneater=printtoken):
+    """
+    The tokenize() function accepts two parameters: one representing the
+    input stream, and one providing an output mechanism for tokenize().
+
+    The first parameter, readline, must be a callable object which provides
+    the same interface as the readline() method of built-in file objects.
+    Each call to the function should return one line of input as a string.
+
+    The second parameter, tokeneater, must also be a callable object. It is
+    called once for each token, with five arguments, corresponding to the
+    tuples generated by generate_tokens().
+    """
+    try:
+        tokenize_loop(readline, tokeneater)
+    except StopTokenizing:
+        pass
+
+# backwards compatible interface
+def tokenize_loop(readline, tokeneater):
+    for token_info in generate_tokens(readline):
+        tokeneater(*token_info)
+
+class Untokenizer:
+
+    def __init__(self):
+        self.tokens = []
+        self.prev_row = 1
+        self.prev_col = 0
+
+    def add_whitespace(self, start):
+        row, col = start
+        assert row <= self.prev_row
+        col_offset = col - self.prev_col
+        if col_offset:
+            self.tokens.append(" " * col_offset)
+
+    def untokenize(self, iterable):
+        for t in iterable:
+            if len(t) == 2:
+                self.compat(t, iterable)
+                break
+            tok_type, token, start, end, line = t
+            self.add_whitespace(start)
+            self.tokens.append(token)
+            self.prev_row, self.prev_col = end
+            if tok_type in (NEWLINE, NL):
+                self.prev_row += 1
+                self.prev_col = 0
+        return "".join(self.tokens)
+
+    def compat(self, token, iterable):
+        startline = False
+        indents = []
+        toks_append = self.tokens.append
+        toknum, tokval = token
+        if toknum in (NAME, NUMBER):
+            tokval += ' '
+        if toknum in (NEWLINE, NL):
+            startline = True
+        for tok in iterable:
+            toknum, tokval = tok[:2]
+
+            if toknum in (NAME, NUMBER):
+                tokval += ' '
+
+            if toknum == INDENT:
+                indents.append(tokval)
+                continue
+            elif toknum == DEDENT:
+                indents.pop()
+                continue
+            elif toknum in (NEWLINE, NL):
+                startline = True
+            elif startline and indents:
+                toks_append(indents[-1])
+                startline = False
+            toks_append(tokval)
+
+cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
+blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
+
+def _get_normal_name(orig_enc):
+    """Imitates get_normal_name in tokenizer.c."""
+    # Only care about the first 12 characters.
+    enc = orig_enc[:12].lower().replace("_", "-")
+    if enc == "utf-8" or enc.startswith("utf-8-"):
+        return "utf-8"
+    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+        return "iso-8859-1"
+    return orig_enc
+
+def detect_encoding(readline):
+    """
+    The detect_encoding() function is used to detect the encoding that should
+    be used to decode a Python source file. It requires one argument, readline,
+    in the same way as the tokenize() generator.
+
+    It will call readline a maximum of twice, and return the encoding used
+    (as a string) and a list of any lines (left as bytes) it has read
+    in.
+
+    It detects the encoding from the presence of a utf-8 bom or an encoding
+    cookie as specified in pep-0263. If both a bom and a cookie are present, but
+    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
+    charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
+    'utf-8-sig' is returned.
+
+    If no encoding is specified, then the default of 'utf-8' will be returned.
+    """
+    bom_found = False
+    encoding = None
+    default = 'utf-8'
+    def read_or_stop():
+        try:
+            return readline()
+        except StopIteration:
+            return bytes()
+
+    def find_cookie(line):
+        try:
+            line_string = line.decode('ascii')
+        except UnicodeDecodeError:
+            return None
+        match = cookie_re.match(line_string)
+        if not match:
+            return None
+        encoding = _get_normal_name(match.group(1))
+        try:
+            codec = lookup(encoding)
+        except LookupError:
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError("unknown encoding: " + encoding)
+
+        if bom_found:
+            if codec.name != 'utf-8':
+                # This behaviour mimics the Python interpreter
+                raise SyntaxError('encoding problem: utf-8')
+            encoding += '-sig'
+        return encoding
+
+    first = read_or_stop()
+    if first.startswith(BOM_UTF8):
+        bom_found = True
+        first = first[3:]
+        default = 'utf-8-sig'
+    if not first:
+        return default, []
+
+    encoding = find_cookie(first)
+    if encoding:
+        return encoding, [first]
+    if not blank_re.match(first):
+        return default, [first]
+
+    second = read_or_stop()
+    if not second:
+        return default, [first]
+
+    encoding = find_cookie(second)
+    if encoding:
+        return encoding, [first, second]
+
+    return default, [first, second]
+
+def untokenize(iterable):
+    """Transform tokens back into Python source code.
+
+    Each element returned by the iterable must be a token sequence
+    with at least two elements, a token number and token value.  If
+    only two tokens are passed, the resulting output is poor.
+
+    Round-trip invariant for full input:
+        Untokenized source will match input source exactly
+
+    Round-trip invariant for limited intput:
+        # Output text will tokenize the back to the input
+        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
+        newcode = untokenize(t1)
+        readline = iter(newcode.splitlines(1)).next
+        t2 = [tok[:2] for tokin generate_tokens(readline)]
+        assert t1 == t2
+    """
+    ut = Untokenizer()
+    return ut.untokenize(iterable)
+
+def generate_tokens(readline):
+    """
+    The generate_tokens() generator requires one argument, readline, which
+    must be a callable object which provides the same interface as the
+    readline() method of built-in file objects. Each call to the function
+    should return one line of input as a string.  Alternately, readline
+    can be a callable function terminating with StopIteration:
+        readline = open(myfile).next    # Example of alternate readline
+
+    The generator produces 5-tuples with these members: the token type; the
+    token string; a 2-tuple (srow, scol) of ints specifying the row and
+    column where the token begins in the source; a 2-tuple (erow, ecol) of
+    ints specifying the row and column where the token ends in the source;
+    and the line on which the token was found. The line passed is the
+    logical line; continuation lines are included.
+    """
+    lnum = parenlev = continued = 0
+    namechars, numchars = string.ascii_letters + '_', '0123456789'
+    contstr, needcont = '', 0
+    contline = None
+    indents = [0]
+
+    while 1:                                   # loop over lines in stream
+        try:
+            line = readline()
+        except StopIteration:
+            line = ''
+        lnum = lnum + 1
+        pos, max = 0, len(line)
+
+        if contstr:                            # continued string
+            if not line:
+                raise TokenError("EOF in multi-line string", strstart)
+            endmatch = endprog.match(line)
+            if endmatch:
+                pos = end = endmatch.end(0)
+                yield (STRING, contstr + line[:end],
+                       strstart, (lnum, end), contline + line)
+                contstr, needcont = '', 0
+                contline = None
+            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
+                yield (ERRORTOKEN, contstr + line,
+                           strstart, (lnum, len(line)), contline)
+                contstr = ''
+                contline = None
+                continue
+            else:
+                contstr = contstr + line
+                contline = contline + line
+                continue
+
+        elif parenlev == 0 and not continued:  # new statement
+            if not line: break
+            column = 0
+            while pos < max:                   # measure leading whitespace
+                if line[pos] == ' ': column = column + 1
+                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
+                elif line[pos] == '\f': column = 0
+                else: break
+                pos = pos + 1
+            if pos == max: break
+
+            if line[pos] in '#\r\n':           # skip comments or blank lines
+                if line[pos] == '#':
+                    comment_token = line[pos:].rstrip('\r\n')
+                    nl_pos = pos + len(comment_token)
+                    yield (COMMENT, comment_token,
+                           (lnum, pos), (lnum, pos + len(comment_token)), line)
+                    yield (NL, line[nl_pos:],
+                           (lnum, nl_pos), (lnum, len(line)), line)
+                else:
+                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
+                           (lnum, pos), (lnum, len(line)), line)
+                continue
+
+            if column > indents[-1]:           # count indents or dedents
+                indents.append(column)
+                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
+            while column < indents[-1]:
+                if column not in indents:
+                    raise IndentationError(
+                        "unindent does not match any outer indentation level",
+                        ("<tokenize>", lnum, pos, line))
+                indents = indents[:-1]
+                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
+
+        else:                                  # continued statement
+            if not line:
+                raise TokenError("EOF in multi-line statement", (lnum, 0))
+            continued = 0
+
+        while pos < max:
+            pseudomatch = pseudoprog.match(line, pos)
+            if pseudomatch:                                # scan for tokens
+                start, end = pseudomatch.span(1)
+                spos, epos, pos = (lnum, start), (lnum, end), end
+                token, initial = line[start:end], line[start]
+
+                if initial in numchars or \
+                   (initial == '.' and token != '.'):      # ordinary number
+                    yield (NUMBER, token, spos, epos, line)
+                elif initial in '\r\n':
+                    newline = NEWLINE
+                    if parenlev > 0:
+                        newline = NL
+                    yield (newline, token, spos, epos, line)
+                elif initial == '#':
+                    assert not token.endswith("\n")
+                    yield (COMMENT, token, spos, epos, line)
+                elif token in triple_quoted:
+                    endprog = endprogs[token]
+                    endmatch = endprog.match(line, pos)
+                    if endmatch:                           # all on one line
+                        pos = endmatch.end(0)
+                        token = line[start:pos]
+                        yield (STRING, token, spos, (lnum, pos), line)
+                    else:
+                        strstart = (lnum, start)           # multiple lines
+                        contstr = line[start:]
+                        contline = line
+                        break
+                elif initial in single_quoted or \
+                    token[:2] in single_quoted or \
+                    token[:3] in single_quoted:
+                    if token[-1] == '\n':                  # continued string
+                        strstart = (lnum, start)
+                        endprog = (endprogs[initial] or endprogs[token[1]] or
+                                   endprogs[token[2]])
+                        contstr, needcont = line[start:], 1
+                        contline = line
+                        break
+                    else:                                  # ordinary string
+                        yield (STRING, token, spos, epos, line)
+                elif initial in namechars:                 # ordinary name
+                    yield (NAME, token, spos, epos, line)
+                elif initial == '\\':                      # continued stmt
+                    # This yield is new; needed for better idempotency:
+                    yield (NL, token, spos, (lnum, pos), line)
+                    continued = 1
+                else:
+                    if initial in '([{': parenlev = parenlev + 1
+                    elif initial in ')]}': parenlev = parenlev - 1
+                    yield (OP, token, spos, epos, line)
+            else:
+                yield (ERRORTOKEN, line[pos],
+                           (lnum, pos), (lnum, pos+1), line)
+                pos = pos + 1
+
+    for indent in indents[1:]:                 # pop remaining indent levels
+        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
+    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
+
+if __name__ == '__main__':                     # testing
+    import sys
+    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
+    else: tokenize(sys.stdin.readline)
--- a/jedi/parser/pytree.py
+++ b/jedi/parser/pytree.py
@@ -0,0 +1,456 @@
+# Copyright 2006 Google, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+"""
+Python parse tree definitions.
+
+This is a very concrete parse tree; we need to keep every token and
+even the comments and whitespace between tokens.
+
+There's also a pattern matching implementation here.
+"""
+
+__author__ = "Guido van Rossum <guido@python.org>"
+
+import sys
+import os
+
+from . import pgen2
+
+HUGE = 0x7FFFFFFF  # maximum repeat count, default max
+
+_type_reprs = {}
+
+
+# The grammar file
+_GRAMMAR_FILE = os.path.join(os.path.dirname(__file__), "grammar.txt")
+
+
+class Symbols(object):
+
+    def __init__(self, grammar):
+        """Initializer.
+
+        Creates an attribute for each grammar symbol (nonterminal),
+        whose value is the symbol's type (an int >= 256).
+        """
+        for name, symbol in grammar.symbol2number.items():
+            setattr(self, name, symbol)
+
+
+python_grammar = pgen2.load_grammar(_GRAMMAR_FILE)
+
+python_symbols = Symbols(python_grammar)
+
+python_grammar_no_print_statement = python_grammar.copy()
+del python_grammar_no_print_statement.keywords["print"]
+
+
+def type_repr(type_num):
+    global _type_reprs
+    if not _type_reprs:
+        # printing tokens is possible but not as useful
+        # from .pgen2 import token // token.__dict__.items():
+        for name, val in python_symbols.__dict__.items():
+            if type(val) == int:
+                _type_reprs[val] = name
+    return _type_reprs.setdefault(type_num, type_num)
+
+
+class Base(object):
+
+    """
+    Abstract base class for Node and Leaf.
+
+    This provides some default functionality and boilerplate using the
+    template pattern.
+
+    A node may be a subnode of at most one parent.
+    """
+
+    # Default values for instance variables
+    type = None    # int: token number (< 256) or symbol number (>= 256)
+    parent = None  # Parent node pointer, or None
+    children = ()  # Tuple of subnodes
+    was_changed = False
+    was_checked = False
+
+    def __new__(cls, *args, **kwds):
+        """Constructor that prevents Base from being instantiated."""
+        assert cls is not Base, "Cannot instantiate Base"
+        return object.__new__(cls)
+
+    def __eq__(self, other):
+        """
+        Compare two nodes for equality.
+
+        This calls the method _eq().
+        """
+        if self.__class__ is not other.__class__:
+            return NotImplemented
+        return self._eq(other)
+
+    __hash__ = None  # For Py3 compatibility.
+
+    def __ne__(self, other):
+        """
+        Compare two nodes for inequality.
+
+        This calls the method _eq().
+        """
+        if self.__class__ is not other.__class__:
+            return NotImplemented
+        return not self._eq(other)
+
+    def _eq(self, other):
+        """
+        Compare two nodes for equality.
+
+        This is called by __eq__ and __ne__.  It is only called if the two nodes
+        have the same type.  This must be implemented by the concrete subclass.
+        Nodes should be considered equal if they have the same structure,
+        ignoring the prefix string and other context information.
+        """
+        raise NotImplementedError
+
+    def clone(self):
+        """
+        Return a cloned (deep) copy of self.
+
+        This must be implemented by the concrete subclass.
+        """
+        raise NotImplementedError
+
+    def post_order(self):
+        """
+        Return a post-order iterator for the tree.
+
+        This must be implemented by the concrete subclass.
+        """
+        raise NotImplementedError
+
+    def pre_order(self):
+        """
+        Return a pre-order iterator for the tree.
+
+        This must be implemented by the concrete subclass.
+        """
+        raise NotImplementedError
+
+    def replace(self, new):
+        """Replace this node with a new one in the parent."""
+        assert self.parent is not None, str(self)
+        assert new is not None
+        if not isinstance(new, list):
+            new = [new]
+        l_children = []
+        found = False
+        for ch in self.parent.children:
+            if ch is self:
+                assert not found, (self.parent.children, self, new)
+                if new is not None:
+                    l_children.extend(new)
+                found = True
+            else:
+                l_children.append(ch)
+        assert found, (self.children, self, new)
+        self.parent.changed()
+        self.parent.children = l_children
+        for x in new:
+            x.parent = self.parent
+        self.parent = None
+
+    def get_lineno(self):
+        """Return the line number which generated the invocant node."""
+        node = self
+        while not isinstance(node, Leaf):
+            if not node.children:
+                return
+            node = node.children[0]
+        return node.lineno
+
+    def changed(self):
+        if self.parent:
+            self.parent.changed()
+        self.was_changed = True
+
+    def remove(self):
+        """
+        Remove the node from the tree. Returns the position of the node in its
+        parent's children before it was removed.
+        """
+        if self.parent:
+            for i, node in enumerate(self.parent.children):
+                if node is self:
+                    self.parent.changed()
+                    del self.parent.children[i]
+                    self.parent = None
+                    return i
+
+    @property
+    def next_sibling(self):
+        """
+        The node immediately following the invocant in their parent's children
+        list. If the invocant does not have a next sibling, it is None
+        """
+        if self.parent is None:
+            return None
+
+        # Can't use index(); we need to test by identity
+        for i, child in enumerate(self.parent.children):
+            if child is self:
+                try:
+                    return self.parent.children[i + 1]
+                except IndexError:
+                    return None
+
+    @property
+    def prev_sibling(self):
+        """
+        The node immediately preceding the invocant in their parent's children
+        list. If the invocant does not have a previous sibling, it is None.
+        """
+        if self.parent is None:
+            return None
+
+        # Can't use index(); we need to test by identity
+        for i, child in enumerate(self.parent.children):
+            if child is self:
+                if i == 0:
+                    return None
+                return self.parent.children[i - 1]
+
+    def leaves(self):
+        for child in self.children:
+            for leave in child.leaves():
+                yield leave
+
+    def depth(self):
+        if self.parent is None:
+            return 0
+        return 1 + self.parent.depth()
+
+    def get_suffix(self):
+        """
+        Return the string immediately following the invocant node. This is
+        effectively equivalent to node.next_sibling.prefix
+        """
+        next_sib = self.next_sibling
+        if next_sib is None:
+            return ""
+        return next_sib.prefix
+
+    if sys.version_info < (3, 0):
+        def __str__(self):
+            return str(self).encode("ascii")
+
+
+class Node(Base):
+    """Concrete implementation for interior nodes."""
+
+    def __init__(self, type, children,
+                 context=None,
+                 prefix=None,
+                 fixers_applied=None):
+        """
+        Initializer.
+
+        Takes a type constant (a symbol number >= 256), a sequence of
+        child nodes, and an optional context keyword argument.
+
+        As a side effect, the parent pointers of the children are updated.
+        """
+        assert type >= 256, type
+        self.type = type
+        self.children = list(children)
+        for ch in self.children:
+            assert ch.parent is None, repr(ch)
+            ch.parent = self
+        if prefix is not None:
+            self.prefix = prefix
+        if fixers_applied:
+            self.fixers_applied = fixers_applied[:]
+        else:
+            self.fixers_applied = None
+
+    def __repr__(self):
+        """Return a canonical string representation."""
+        return "%s(%s, %r)" % (self.__class__.__name__,
+                               type_repr(self.type),
+                               self.children)
+
+    def __unicode__(self):
+        """
+        Return a pretty string representation.
+
+        This reproduces the input source exactly.
+        """
+        return "".join(map(str, self.children))
+
+    if sys.version_info > (3, 0):
+        __str__ = __unicode__
+
+    def _eq(self, other):
+        """Compare two nodes for equality."""
+        return (self.type, self.children) == (other.type, other.children)
+
+    def clone(self):
+        """Return a cloned (deep) copy of self."""
+        return Node(self.type, [ch.clone() for ch in self.children],
+                    fixers_applied=self.fixers_applied)
+
+    def post_order(self):
+        """Return a post-order iterator for the tree."""
+        for child in self.children:
+            for el in child.post_order():
+                yield el
+        yield self
+
+    def pre_order(self):
+        """Return a pre-order iterator for the tree."""
+        yield self
+        for child in self.children:
+            for el in child.post_order():
+                yield el
+
+    def _prefix_getter(self):
+        """
+        The whitespace and comments preceding this node in the input.
+        """
+        if not self.children:
+            return ""
+        return self.children[0].prefix
+
+    def _prefix_setter(self, prefix):
+        if self.children:
+            self.children[0].prefix = prefix
+
+    prefix = property(_prefix_getter, _prefix_setter)
+
+    def set_child(self, i, child):
+        """
+        Equivalent to 'node.children[i] = child'. This method also sets the
+        child's parent attribute appropriately.
+        """
+        child.parent = self
+        self.children[i].parent = None
+        self.children[i] = child
+        self.changed()
+
+    def insert_child(self, i, child):
+        """
+        Equivalent to 'node.children.insert(i, child)'. This method also sets
+        the child's parent attribute appropriately.
+        """
+        child.parent = self
+        self.children.insert(i, child)
+        self.changed()
+
+    def append_child(self, child):
+        """
+        Equivalent to 'node.children.append(child)'. This method also sets the
+        child's parent attribute appropriately.
+        """
+        child.parent = self
+        self.children.append(child)
+        self.changed()
+
+
+class Leaf(Base):
+
+    """Concrete implementation for leaf nodes."""
+
+    # Default values for instance variables
+    _prefix = ""  # Whitespace and comments preceding this token in the input
+    lineno = 0    # Line where this token starts in the input
+    column = 0    # Column where this token tarts in the input
+
+    def __init__(self, type, value,
+                 context=None,
+                 prefix=None,
+                 fixers_applied=[]):
+        """
+        Initializer.
+
+        Takes a type constant (a token number < 256), a string value, and an
+        optional context keyword argument.
+        """
+        assert 0 <= type < 256, type
+        if context is not None:
+            self._prefix, (self.lineno, self.column) = context
+        self.type = type
+        self.value = value
+        if prefix is not None:
+            self._prefix = prefix
+        self.fixers_applied = fixers_applied[:]
+
+    def __repr__(self):
+        """Return a canonical string representation."""
+        return "%s(%r, %r)" % (self.__class__.__name__,
+                               self.type,
+                               self.value)
+
+    def __unicode__(self):
+        """
+        Return a pretty string representation.
+
+        This reproduces the input source exactly.
+        """
+        return self.prefix + str(self.value)
+
+    if sys.version_info > (3, 0):
+        __str__ = __unicode__
+
+    def _eq(self, other):
+        """Compare two nodes for equality."""
+        return (self.type, self.value) == (other.type, other.value)
+
+    def clone(self):
+        """Return a cloned (deep) copy of self."""
+        return Leaf(self.type, self.value,
+                    (self.prefix, (self.lineno, self.column)),
+                    fixers_applied=self.fixers_applied)
+
+    def leaves(self):
+        yield self
+
+    def post_order(self):
+        """Return a post-order iterator for the tree."""
+        yield self
+
+    def pre_order(self):
+        """Return a pre-order iterator for the tree."""
+        yield self
+
+    def _prefix_getter(self):
+        """
+        The whitespace and comments preceding this token in the input.
+        """
+        return self._prefix
+
+    def _prefix_setter(self, prefix):
+        self.changed()
+        self._prefix = prefix
+
+    prefix = property(_prefix_getter, _prefix_setter)
+
+
+def convert(gr, raw_node):
+    """
+    Convert raw node information to a Node or Leaf instance.
+
+    This is passed to the parser driver which calls it whenever a reduction of a
+    grammar rule produces a new complete node, so that the tree is build
+    strictly bottom-up.
+    """
+    #import pdb; pdb.set_trace()
+    print(raw_node)
+    type, value, context, children = raw_node
+    if children or type in gr.number2symbol:
+        # If there's exactly one child, return that child instead of
+        # creating a new node.
+        if len(children) == 1:
+            return children[0]
+        return Node(type, children, context=context)
+    else:
+        return Leaf(type, value, context=context)
--- a/test/test_new_parser.py
+++ b/test/test_new_parser.py
@@ -0,0 +1,27 @@
+import logging
+
+from jedi.parser import pytree
+from jedi.parser.pgen2 import Driver
+
+
+def test_basic():
+    #if self.options["print_function"]:
+    #    python_grammar = pygram.python_grammar_no_print_statement
+    #else:
+    # When this is True, the refactor*() methods will call write_file() for
+    # files processed even if they were not changed during refactoring. If
+    # and only if the refactor method's write parameter was True.
+    logger = logging.getLogger("RefactoringTool")
+    d = Driver(pytree.python_grammar, convert=pytree.convert, logger=logger)
+
+    print(d)
+    tree = d.parse_string('wblabla* 1\t\n')
+    print(repr(tree))
+    print(tree)
+    #import pdb; pdb.set_trace()
+    print(repr(d.parse_string('def x(): pass\n')))
+    print()
+    x = d.parse_string('\na #pass\n')
+    print(repr(x))
+    print(x)
+    assert False