jedi-fork/pyfuzzyparser.py

""""
TODO This is a parser

scope
    imports
    subscopes
    statements

Ignored simple statements:
 - print (no use for it)
 - assert
 - break, continue (because we avoid loops)
 - del (also not used, since this script avoids loops and files)
 - exec (dangerous - not controllable)

global is a special case and will not be used here
"""
import sys
import tokenize
import cStringIO
import token


def indent_block(text, indention="    "):
    """
    Prefix every line of `text` with `indention` (four spaces by default).

    Trailing newlines are not indented: they are split off first and
    re-attached unchanged after the last indented line.
    """
    body = text.rstrip('\n')
    trailing_newlines = text[len(body):]
    indented = [indention + line for line in body.split('\n')]
    return '\n'.join(indented) + trailing_newlines


class Scope(object):
    """
    A node of the scope tree built by the parser.

    A scope collects its nested scopes, imports, locals and statements and
    keeps a reference to its parent scope.

    :param name: name of the scope.
    :param indent: indentation value stored for this scope.
    :param line_nr: line number stored for this scope.
    :param docstr: docstring of the scope (empty by default).
    """
    def __init__(self, name, indent, line_nr, docstr=''):
        self.subscopes = []
        self.locals = []
        self.imports = []
        self.statements = []
        self.docstr = docstr
        self.parent = None
        self.name = name
        self.indent = indent
        self.line_nr = line_nr

    def add_scope(self, sub):
        """
        Register `sub` as a child of this scope and return it.

        The former ``if sub == None`` debug branch was removed: it could only
        be entered with ``None`` and then failed on ``sub.name``. Passing
        ``None`` still raises an AttributeError.
        """
        sub.parent = self
        self.subscopes.append(sub)
        return sub

    def doc(self, str):
        """
        Clean up a docstring and store it in `self.docstr`.

        Newlines and tabs become spaces, runs of spaces are collapsed and
        surrounding quotes/whitespace are removed.
        """
        d = str.replace('\n', ' ').replace('\t', ' ')
        while '  ' in d:
            d = d.replace('  ', ' ')
        # strip() instead of indexing d[0] / d[-1]: a docstring consisting
        # only of quotes and whitespace used to raise an IndexError here.
        d = d.strip('"\'\t ')
        dbg("Scope(%s)::docstr = %s" % (self, d))
        self.docstr = d

    def add_local(self, loc):
        """ Append `loc` to the locals of this scope. """
        self.locals.append(loc)

    def add_import(self, imp):
        """ Append `imp` to the imports of this scope. """
        self.imports.append(imp)

    def _checkexisting(self, test):
        """
        Convenience function to keep out duplicates: if `test` is an
        assignment, drop every stored local that assigns to the same name.
        """
        if '=' not in test:
            return
        var = test.split('=')[0].strip()
        # Rebuild the list in place; removing entries while iterating over
        # the same list skipped the element following each removed one.
        self.locals[:] = [
            l for l in self.locals
            if not ('=' in l and l.split('=')[0].strip() == var)
        ]

    def get_code(self, first_indent=False, indention="    "):
        """
        Returns the code of the current scope.

        :param first_indent: if true, the whole result is indented once.
        :param indention: the string used for one level of indentation.
        """
        parts = []
        # truthiness check, so a docstr of None is treated like an empty one
        if self.docstr:
            parts.append('"""' + self.docstr + '"""\n')
        for i in self.imports:
            parts.append(i.get_code() + '\n')
        for sub in self.subscopes:
            # NOTE(review): the line number is glued in front of the code of
            # every subscope - this looks like a debugging aid; confirm
            # before relying on the output being valid Python.
            parts.append(str(sub.line_nr)
                         + sub.get_code(first_indent=True,
                                        indention=indention))
        for l in self.locals:
            parts.append(l + '\n')
        string = ''.join(parts)

        if first_indent:
            string = indent_block(string, indention=indention)
        return string

    def is_empty(self):
        """
        this function returns true if there are no subscopes, imports, locals.
        """
        return not (self.locals or self.imports or self.subscopes)


class Class(Scope):
    """
    Scope of a class definition.

    :param supers: the super classes as a list of strings; they are joined
        with commas when the code is generated.
    """
    def __init__(self, name, supers, indent, line_nr, docstr=''):
        super(Class, self).__init__(name, indent, line_nr, docstr)
        self.supers = supers

    def get_code(self, first_indent=False, indention="    "):
        """ Returns the class header followed by the indented scope code. """
        pieces = ['class %s' % (self.name)]
        if len(self.supers) > 0:
            pieces.append('(%s)' % ','.join(self.supers))
        pieces.append(':\n')
        pieces.append(super(Class, self).get_code(True, indention))
        if self.is_empty():
            # nothing but (maybe) a docstring inside -> keep the body valid
            pieces.append("pass\n")
        return ''.join(pieces)

class Flow(Scope):
    """
    Used to describe programming structure - flow statements,
    which indent code, but are not classes or functions:
    - for
    - while
    - if
    - try
    - with
    Therefore statements like else, except and finally are also here,
    they are now saved in the root flow elements, but in the next variable.

    :param code: the code of the flow statement; it is used as the name of
        the scope.
    :param functions: stored unchanged in `self.functions`.
    :param indent: indentation value stored for this scope.
    :param line_nr: line number stored for this scope.
    """
    def __init__(self, code, functions, indent, line_nr):
        # `code` doubles as the scope name. The old code passed an unbound
        # local `name` here (UnboundLocalError) and None as docstring, which
        # Scope.get_code cannot handle.
        super(Flow, self).__init__(code, indent, line_nr, '')
        self.functions = functions
        self.next = None

    def get_code(self, first_indent=False, indention="    "):
        """
        Returns the flow header followed by the indented scope code.

        NOTE(review): assumes `code` holds the header without the trailing
        colon (e.g. ``for a in b``) - confirm once a caller exists. The
        former ``class `` prefix was a copy of Class.get_code.
        """
        text = '%s:\n' % (self.name)
        # super(Flow, ...): the old super(Class, self) raised a TypeError,
        # because Flow does not inherit from Class.
        text += super(Flow, self).get_code(True, indention)
        if self.is_empty():
            text += "pass\n"
        return text

class Function(Scope):
    """
    Scope of a function definition.

    :param params: the parameters as a list of strings; they are joined
        with commas when the code is generated.
    """
    def __init__(self, name, params, indent, line_nr, docstr=''):
        super(Function, self).__init__(name, indent, line_nr, docstr)
        self.params = params

    def get_code(self, first_indent=False, indention="    "):
        """ Returns the def header followed by the indented scope code. """
        header = "def %s(%s):\n" % (self.name, ','.join(self.params))
        body = super(Function, self).get_code(True, indention)
        # an empty scope still needs a statement in its body
        tail = "pass\n" if self.is_empty() else ""
        return header + body + tail


class Import(object):
    """
    Stores one import of a scope.

    :param line_nr: Line number.
    :type line_nr: int
    :param namespace: the import, as a list, e.g. ['datetime', 'time']
    :type namespace: list
    :param alias: the alias (valid in the current namespace).
    :param from_ns: from declaration in an import.
    :param star: if a star is used -> from time import *.

    TODO check star?
    """
    def __init__(self, line_nr, namespace, alias='', from_ns='', star=False):
        self.line_nr = line_nr
        self.namespace = namespace
        self.alias = alias
        self.from_ns = from_ns
        self.star = star

    def get_code(self):
        """ Returns the import as a line of Python code. """
        dotted = ".".join(self.namespace)
        target = "%s as %s" % (dotted, self.alias) if self.alias else dotted
        if not self.from_ns:
            return "import " + target
        if self.star:
            # a star import replaces name and alias altogether
            target = '*'
        return "from %s import %s" % (self.from_ns, target)


class Statement(object):
    """
    Container for one parsed statement.

    :param code: stored in `self.code`.
    :param locals: stored in `self.locals`.
    :param functions: stored in `self.functions`.
    """
    def __init__(self, code, locals, functions):
        self.code, self.locals, self.functions = code, locals, functions

    def get_code(self):
        """ Not implemented yet; always raises NotImplementedError. """
        raise NotImplementedError


class Local(object):
    """
    stores locals variables of any scopes
    """
    def __init__(self, line_nr, left, right=None, is_global=False):
        """
        @param line_nr
        @param left: the left part of the local assignment
        @param right: the right part of the assignment, must not be set
                      (in case of global)
        @param is_global: defines a global variable
        """
        self.line_nr = line_nr
        self.left = left
        self.right = right
        # used to be dropped silently, although it is a documented parameter
        self.is_global = is_global

    def get_code(self):
        """
        Returns the local as a line of code: ``global <left>`` for globals,
        ``<left> = <right>`` for assignments and just ``<left>`` if there is
        no right part.

        The previous body was a copy of Import.get_code and read attributes
        (alias, namespace, from_ns, star) that a Local never has, so it
        always raised an AttributeError.
        """
        if self.is_global:
            return "global %s" % (self.left,)
        if self.right is None:
            return "%s" % (self.left,)
        return "%s = %s" % (self.left, self.right)


class Name(object):
    """
    Used to define names in python.
    Which means the whole namespace/class/function stuff.
    So a name like "module.class.function"
    would result in an array of [module, class, function]

    :param names: the parts of the dotted name, as a list of strings.
    """
    def __init__(self, names):
        super(Name, self).__init__()
        self.names = names

    def get_code(self):
        """ returns the name again in a full string format """
        # self.names - the bare `names` used before raised a NameError
        return ".".join(self.names)


class PyFuzzyParser(object):
    """
    This class is used to parse a Python file; it then divides the file into
    a class structure of different scopes.
    """
    def __init__(self):
        """ Create the top level ('global') scope and make it current. """
        root = Scope('global', 0, 0)
        self.top = root
        self.scope = root

    def _parsedotname(self, pre_used_token=None):
        """ @return (dottedname, nexttoken) """
        names = []
        if pre_used_token is None:
            tokentype, tok, indent = self.next()
            if tokentype != tokenize.NAME and tok != '*':
                return ([], tok)
        else:
            tokentype, tok, indent = pre_used_token
        names.append(tok)
        while True:
            tokentype, tok, indent = self.next()
            if tok != '.':
                break
            tokentype, tok, indent = self.next()
            if tokentype != tokenize.NAME:
                break
            names.append(tok)
        return (names, tok)


    def _parse_value_list(self, pre_used_token=None):
        """
        A value list is a comma separated list. This is used for:
        >>> for a,b,self.c in enumerate(test)
        """
        value_list = []
        if pre_used_token:
            tokentype, tok, indent = pre_used_token
            n = self._parsedotname(tok)
            if n:
                value_list.append(n)

        tokentype, tok, indent = self.next()
        while tok != 'in' and tokentype != tokenize.NEWLINE:
            n = self._parsedotname(self.current)
            if n:
                value_list.append(n)

            tokentype, tok, indent = self.next()
        return (value_list, tok)

    def _parseimportlist(self):
        imports = []
        while True:
            name, tok = self._parsedotname()
            if not name:
                break
            name2 = ''
            if tok == 'as':
                name2, tok = self._parsedotname()
            imports.append((name, name2))
            while tok != "," and "\n" not in tok:
                tokentype, tok, indent = self.next()
            if tok != ",":
                break
        return imports

    def _parseparen(self):
        name = ''
        names = []
        level = 1
        while True:
            tokentype, tok, indent = self.next()
            if tok in (')', ',') and level == 1:
                if '=' not in name:
                    name = name.replace(' ', '')
                names.append(name.strip())
                name = ''
            if tok == '(':
                level += 1
                name += "("
            elif tok == ')':
                level -= 1
                if level == 0:
                    break
                else:
                    name += ")"
            elif tok == ',' and level == 1:
                pass
            else:
                name += "%s " % str(tok)
        return names


    def _parsefunction(self, indent):
        tokentype, fname, ind = self.next()
        if tokentype != tokenize.NAME:
            return None

        tokentype, open, ind = self.next()
        if open != '(':
            return None
        params = self._parseparen()

        tokentype, colon, ind = self.next()
        if colon != ':':
            return None

        return Function(fname, params, indent, self.line_nr)


    def _parseclass(self, indent):
        tokentype, cname, ind = self.next()
        if tokentype != tokenize.NAME:
            return None

        super = []
        tokentype, next, ind = self.next()
        if next == '(':
            super = self._parseparen()
        elif next != ':':
            return None

        return Class(cname, super, indent, self.line_nr)


    def _parseassignment(self):
        assign = ''
        tokentype, tok, indent = self.next()
        if tokentype == tokenize.STRING or tok == 'str':
            return '""'
        elif tok == '(' or tok == 'tuple':
            return '()'
        elif tok == '[' or tok == 'list':
            return '[]'
        elif tok == '{' or tok == 'dict':
            return '{}'
        elif tokentype == tokenize.NUMBER:
            return '0'
        elif tok == 'open' or tok == 'file':
            return 'file'
        elif tok == 'None':
            return '_PyCmplNoType()'
        elif tok == 'type':
            return 'type(_PyCmplNoType)'  # only for method resolution
        else:
            assign += tok
            level = 0
            while True:
                tokentype, tok, indent = self.next()
                if tok in ('(', '{', '['):
                    level += 1
                elif tok in (']', '}', ')'):
                    level -= 1
                    if level == 0:
                        break
                elif level == 0:
                    if tok in (';', '\n'):
                        break
                    assign += tok
        return "%s" % assign


    def _parse_words(self, pre_used_token):
        """
        Used to parse a word, if the tokenizer returned a word at the start of
        a new command. Currently this only delegates to `_parse_statement`.

        :param pre_used_token: The pre parsed token, as returned by `next`.
        :type pre_used_token: tuple (type, token, indent)
        :return: the result of `_parse_statement`.
        """
        return self._parse_statement(pre_used_token)


    def _parse_statement(self, pre_used_token = None):
        """
        Parses statements like:

        >>> a = test(b)
        >>> a += 3 - 2 or b

        and so on. One row at a time.

        :param pre_used_token: The pre parsed token.
        :type pre_used_token: set
        :return: Statement + last parsed token.
        :rtype: (Statement, str)
        """
        string = ''
        set_vars = []
        used_funcs = []
        used_vars = []

        token_type, tok, indent = pre_used_token
        while tok != '\n' and tok != ';':
            set_string = ''
            print 'parse_stmt', tok, token.tok_name[token_type]
            if token_type == tokenize.NAME:
                if tok == 'pass':
                    set_string = ''
                elif tok == 'return' or tok == 'del':
                    set_string = tok + ' '
                elif tok == 'print':
                    set_string = ''
                else:
                    path, tok = self._parsedotname(self.current)
                    if tok == '(':
                        # it must be a function
                        used_funcs.append(path)
                    else:
                        used_vars.append(path)
                    string += ".".join(path)
                    print 'parse_stmt', tok, token.tok_name[token_type]
                    if tok == '\n' or tok == ';':
                        break

            if ('=' in tok and not tok in ['>=', '<=', '==', '!=']):
                # there has been an assignement -> change vars
                set_vars = used_vars
                used_vars = []

            if set_string:
                string = set_string
            else:
                string += tok
            token_type, tok, indent = self.next()
        if not string:
            return None, tok
        print 'new_stat', string, set_vars, used_funcs, used_vars
        #return Statement(), tok

    def next(self):
        type, tok, position, dummy, self.parserline = self.gen.next()
        (self.line_nr, indent) = position
        self.current = (type, tok, indent)
        return self.current

    def parse(self, text):
        """
        Tokenize `text` and fill the scope tree.

        :param text: the source code, a string or a sequence of strings
            (the parts are joined without a separator).
        :return: the top level scope.
        """
        buf = cStringIO.StringIO(''.join(text) + '\n')
        self.gen = tokenize.generate_tokens(buf.readline)
        self.currentscope = self.scope

        try:
            freshscope = True
            while True:
                full_token = self.next()
                tokentype, tok, indent = full_token
                dbg('main: tok=[%s] type=[%s] indent=[%s]'
                    % (tok, tokentype, indent))

                if tokentype == tokenize.DEDENT:
                    # Leave all scopes that start at or right of the new
                    # column, but never the top level scope. Popping one
                    # scope per DEDENT also closed scopes for blocks that
                    # never opened one (if/for/...) and could end up with
                    # None as the current scope.
                    while self.scope.parent is not None \
                            and self.scope.indent >= indent:
                        self.scope = self.scope.parent
                elif tok == 'def':
                    func = self._parsefunction(indent)
                    if func is None:
                        print("function: syntax error...")
                        continue
                    dbg("new scope: function %s" % (func.name))
                    freshscope = True
                    self.scope = self.scope.add_scope(func)
                elif tok == 'class':
                    cls = self._parseclass(indent)
                    if cls is None:
                        print("class: syntax error...")
                        continue
                    freshscope = True
                    dbg("new scope: class %s" % (cls.name))
                    self.scope = self.scope.add_scope(cls)
                # import stuff
                elif tok == 'import':
                    imports = self._parseimportlist()
                    for mod, alias in imports:
                        self.scope.add_import(Import(self.line_nr, mod, alias))
                    freshscope = False
                elif tok == 'from':
                    mod, tok = self._parsedotname()
                    if not mod or tok != "import":
                        print("from: syntax error...")
                        continue
                    # Import formats from_ns with %s, so hand over the
                    # dotted string instead of the list of its parts
                    from_ns = '.'.join(mod)
                    names = self._parseimportlist()
                    for name, alias in names:
                        i = Import(self.line_nr, name, alias, from_ns)
                        self.scope.add_import(i)
                    freshscope = False
                # loops
                elif tok == 'for':
                    dbg('for', tok, tokentype)
                    value_list, tok = self._parse_value_list()
                    if tok == 'in':
                        # pass the first token explicitly and accept a
                        # missing result instead of unpacking blindly
                        result = self._parse_statement(self.next())
                        if result is not None:
                            statement, tok = result
                            # NOTE(review): _parse_statement only stops at a
                            # newline or a semicolon, so `tok` is not ':'
                            # here and nothing gets stored yet.
                            if tok == ':' and statement is not None:
                                self.scope.statements.append(statement)
                elif tok == 'while':
                    # there is no _parse_while_loop method (the old call
                    # raised an AttributeError); treat the row like any
                    # other statement for now
                    self._parse_words(full_token)
                elif tok == 'global':
                    self._parse_words(full_token)
                elif tokentype == tokenize.STRING:
                    if freshscope:
                        self.scope.doc(tok)
                elif tokentype == tokenize.NAME:
                    self._parse_words(full_token)
                    freshscope = False
        except StopIteration:  # thrown on EOF
            pass
        return self.top


def _sanitize(str):
    val = ''
    level = 0
    for c in str:
        if c in ('(', '{', '['):
            level += 1
        elif c in (']', '}', ')'):
            level -= 1
        elif level == 0:
            val += c
    return val


def dbg(*args):
    """ Debug hook: takes any arguments and currently discards them. """
    # print args
    return None