diff --git a/parso/__init__.py b/parso/__init__.py new file mode 100644 index 0000000..ec69501 --- /dev/null +++ b/parso/__init__.py @@ -0,0 +1,8 @@ +from parso.parser import ParserSyntaxError +from parso.pgen2.pgen import generate_grammar +from parso import python + + +def parse(grammar, code): + raise NotImplementedError + Parser(grammar, code) diff --git a/parso/cache.py b/parso/cache.py new file mode 100644 index 0000000..182a8a4 --- /dev/null +++ b/parso/cache.py @@ -0,0 +1,147 @@ +import time +import os +import sys +import hashlib +import gc +import shutil +import pickle +import platform +import errno + +from jedi import settings +from jedi import debug +from jedi._compatibility import FileNotFoundError + + +_PICKLE_VERSION = 30 +""" +Version number (integer) for file system cache. + +Increment this number when there are any incompatible changes in +the parser tree classes. For example, the following changes +are regarded as incompatible. + +- A class name is changed. +- A class is moved to another module. +- A __slot__ of a class is changed. +""" + +_VERSION_TAG = '%s-%s%s-%s' % ( + platform.python_implementation(), + sys.version_info[0], + sys.version_info[1], + _PICKLE_VERSION +) +""" +Short name for distinguish Python implementations and versions. + +It's like `sys.implementation.cache_tag` but for Python < 3.3 +we generate something similar. See: +http://docs.python.org/3/library/sys.html#sys.implementation +""" + +# for fast_parser, should not be deleted +parser_cache = {} + + + +class _NodeCacheItem(object): + def __init__(self, node, lines, change_time=None): + self.node = node + self.lines = lines + if change_time is None: + change_time = time.time() + self.change_time = change_time + + +def load_module(grammar, path): + """ + Returns a module or None, if it fails. + """ + try: + p_time = os.path.getmtime(path) + except FileNotFoundError: + return None + + try: + # TODO Add grammar sha256 + module_cache_item = parser_cache[path] + if p_time <= module_cache_item.change_time: + return module_cache_item.node + except KeyError: + if not settings.use_filesystem_cache: + return None + + return _load_from_file_system(grammar, path, p_time) + + +def _load_from_file_system(grammar, path, p_time): + cache_path = _get_hashed_path(grammar, path) + try: + try: + if p_time > os.path.getmtime(cache_path): + # Cache is outdated + return None + except OSError as e: + if e.errno == errno.ENOENT: + # In Python 2 instead of an IOError here we get an OSError. + raise FileNotFoundError + else: + raise + + with open(cache_path, 'rb') as f: + gc.disable() + try: + module_cache_item = pickle.load(f) + finally: + gc.enable() + except FileNotFoundError: + return None + else: + parser_cache[path] = module_cache_item + debug.dbg('pickle loaded: %s', path) + return module_cache_item.node + + +def save_module(grammar, path, module, lines, pickling=True): + try: + p_time = None if path is None else os.path.getmtime(path) + except OSError: + p_time = None + pickling = False + + item = _NodeCacheItem(module, lines, p_time) + parser_cache[path] = item + if settings.use_filesystem_cache and pickling and path is not None: + _save_to_file_system(grammar, path, item) + + +def _save_to_file_system(grammar, path, item): + with open(_get_hashed_path(grammar, path), 'wb') as f: + pickle.dump(item, f, pickle.HIGHEST_PROTOCOL) + + +def remove_old_modules(self): + """ + # TODO Might want to use such a function to clean up the cache (if it's + # too old). 
We could potentially also scan for old files in the + # directory and delete those. + """ + + +def clear_cache(self): + shutil.rmtree(settings.cache_directory) + parser_cache.clear() + + +def _get_hashed_path(grammar, path): + file_hash = hashlib.sha256(path.encode("utf-8")).hexdigest() + directory = _get_cache_directory_path() + return os.path.join(directory, '%s-%s.pkl' % (grammar.sha256, file_hash)) + + +def _get_cache_directory_path(): + directory = os.path.join(settings.cache_directory, _VERSION_TAG) + if not os.path.exists(directory): + os.makedirs(directory) + return directory diff --git a/parso/parser.py b/parso/parser.py new file mode 100644 index 0000000..7edf88c --- /dev/null +++ b/parso/parser.py @@ -0,0 +1,77 @@ +""" +The ``Parser`` tries to convert the available Python code in an easy to read +format, something like an abstract syntax tree. The classes who represent this +tree, are sitting in the :mod:`jedi.parser.tree` module. + +The Python module ``tokenize`` is a very important part in the ``Parser``, +because it splits the code into different words (tokens). Sometimes it looks a +bit messy. Sorry for that! You might ask now: "Why didn't you use the ``ast`` +module for this? Well, ``ast`` does a very good job understanding proper Python +code, but fails to work as soon as there's a single line of broken code. + +There's one important optimization that needs to be known: Statements are not +being parsed completely. ``Statement`` is just a representation of the tokens +within the statement. This lowers memory usage and cpu time and reduces the +complexity of the ``Parser`` (there's another parser sitting inside +``Statement``, which produces ``Array`` and ``Call``). +""" +from parso import tree +from parso.pgen2.parse import PgenParser + + +class ParserSyntaxError(Exception): + """ + Contains error information about the parser tree. + + May be raised as an exception. + """ + def __init__(self, message, position): + self.message = message + self.position = position + + +class BaseParser(object): + node_map = {} + default_node = tree.Node + + leaf_map = { + } + default_leaf = tree.Leaf + + def __init__(self, grammar, start_symbol='file_input', error_recovery=False): + self._grammar = grammar + self._start_symbol = start_symbol + self._error_recovery = error_recovery + + def parse(self, tokens): + start_number = self._grammar.symbol2number[self._start_symbol] + self.pgen_parser = PgenParser( + self._grammar, self.convert_node, self.convert_leaf, + self.error_recovery, start_number + ) + + node = self.pgen_parser.parse(tokens) + # The stack is empty now, we don't need it anymore. + del self.pgen_parser + return node + + def error_recovery(self, grammar, stack, arcs, typ, value, start_pos, prefix, + add_token_callback): + if self._error_recovery: + raise NotImplementedError("Error Recovery is not implemented") + else: + raise ParserSyntaxError('SyntaxError: invalid syntax', start_pos) + + def convert_node(self, grammar, type_, children): + # TODO REMOVE symbol, we don't want type here. 
+ symbol = grammar.number2symbol[type_] + try: + return self.node_map[symbol](children) + except KeyError: + return self.default_node(symbol, children) + + def convert_leaf(self, grammar, type_, value, prefix, start_pos): + try: + return self.leaf_map[type_](value, start_pos, prefix) + except KeyError: + return self.default_leaf(value, start_pos, prefix) diff --git a/parso/pgen2/__init__.py b/parso/pgen2/__init__.py new file mode 100644 index 0000000..1ddae5f --- /dev/null +++ b/parso/pgen2/__init__.py @@ -0,0 +1,8 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +# Modifications: +# Copyright 2006 Google, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. +# Copyright 2014 David Halter. Integration into Jedi. +# Modifications are dual-licensed: MIT and PSF. diff --git a/parso/pgen2/grammar.py b/parso/pgen2/grammar.py new file mode 100644 index 0000000..44214f9 --- /dev/null +++ b/parso/pgen2/grammar.py @@ -0,0 +1,127 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +# Modifications: +# Copyright 2014 David Halter. Integration into Jedi. +# Modifications are dual-licensed: MIT and PSF. + +"""This module defines the data structures used to represent a grammar. + +These are a bit arcane because they are derived from the data +structures used by Python's 'pgen' parser generator. + +There's also a table here mapping operators to their names in the +token module; the Python tokenize module reports all operators as the +fallback token code OP, but the parser needs the actual token code. + +""" + +import pickle +import hashlib + + + +class Grammar(object): + """Pgen parsing tables conversion class. + + Once initialized, this class supplies the grammar tables for the + parsing engine implemented by parse.py. The parsing engine + accesses the instance variables directly. The class here does not + provide initialization of the tables; several subclasses exist to + do this (see the conv and pgen modules). + + The load() method reads the tables from a pickle file, which is + much faster than the other ways offered by subclasses. The pickle + file is written by calling dump() (after loading the grammar + tables using a subclass). The report() method prints a readable + representation of the tables to stdout, for debugging. + + The instance variables are as follows: + + symbol2number -- a dict mapping symbol names to numbers. Symbol + numbers are always 256 or higher, to distinguish + them from token numbers, which are between 0 and + 255 (inclusive). + + number2symbol -- a dict mapping numbers to symbol names; + these two are each other's inverse. + + states -- a list of DFAs, where each DFA is a list of + states, each state is a list of arcs, and each + arc is a (i, j) pair where i is a label and j is + a state number. The DFA number is the index into + this list. (This name is slightly confusing.) + Final states are represented by a special arc of + the form (0, j) where j is its own state number. + + dfas -- a dict mapping symbol numbers to (DFA, first) + pairs, where DFA is an item from the states list + above, and first is a set of tokens that can + begin this grammar rule (represented by a dict + whose values are always 1). + + labels -- a list of (x, y) pairs where x is either a token + number or a symbol number, and y is either None + or a string; the strings are keywords. 
The label + number is the index in this list; label numbers + are used to mark state transitions (arcs) in the + DFAs. + + start -- the number of the grammar's start symbol. + + keywords -- a dict mapping keyword strings to arc labels. + + tokens -- a dict mapping token numbers to arc labels. + + """ + + def __init__(self, bnf_text): + self.symbol2number = {} + self.number2symbol = {} + self.states = [] + self.dfas = {} + self.labels = [(0, "EMPTY")] + self.keywords = {} + self.tokens = {} + self.symbol2label = {} + self.start = 256 + self.sha256 = hashlib.sha256(bnf_text.encode("utf-8")).hexdigest() + + def dump(self, filename): + """Dump the grammar tables to a pickle file.""" + with open(filename, "wb") as f: + pickle.dump(self.__dict__, f, 2) + + def load(self, filename): + """Load the grammar tables from a pickle file.""" + with open(filename, "rb") as f: + d = pickle.load(f) + self.__dict__.update(d) + + def copy(self): + """ + Copy the grammar. + """ + new = self.__class__() + for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords", + "tokens", "symbol2label"): + setattr(new, dict_attr, getattr(self, dict_attr).copy()) + new.labels = self.labels[:] + new.states = self.states[:] + new.start = self.start + return new + + def report(self): + """Dump the grammar tables to standard output, for debugging.""" + from pprint import pprint + print("s2n") + pprint(self.symbol2number) + print("n2s") + pprint(self.number2symbol) + print("states") + pprint(self.states) + print("dfas") + pprint(self.dfas) + print("labels") + pprint(self.labels) + print("start", self.start) diff --git a/parso/pgen2/parse.py b/parso/pgen2/parse.py new file mode 100644 index 0000000..a90c6f7 --- /dev/null +++ b/parso/pgen2/parse.py @@ -0,0 +1,217 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +# Modifications: +# Copyright 2014 David Halter. Integration into Jedi. +# Modifications are dual-licensed: MIT and PSF. + +""" +Parser engine for the grammar tables generated by pgen. + +The grammar table must be loaded first. + +See Parser/parser.c in the Python distribution for additional info on +how this parsing engine works. +""" + +from parso import tokenize + + +class InternalParseError(Exception): + """ + Exception to signal the parser is stuck and error recovery didn't help. + Basically this shouldn't happen. It's a sign that something is really + wrong. + """ + + def __init__(self, msg, type, value, start_pos): + Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" % + (msg, tokenize.tok_name[type], value, start_pos)) + self.msg = msg + self.type = type + self.value = value + self.start_pos = start_pos + + +def token_to_ilabel(grammar, type_, value): + # Map from token to label + if type_ == tokenize.NAME: + # Check for reserved words (keywords) + try: + return grammar.keywords[value] + except KeyError: + pass + + try: + return grammar.tokens[type_] + except KeyError: + return None + + +class PgenParser(object): + """Parser engine. + + The proper usage sequence is: + + p = Parser(grammar, [converter]) # create instance + p.setup([start]) # prepare for parsing + : + if p.addtoken(...): # parse a token + break + root = p.rootnode # root of abstract syntax tree + + A Parser instance may be reused by calling setup() repeatedly. + + A Parser instance contains state pertaining to the current token + sequence, and should not be used concurrently by different threads + to parse separate token sequences. 
+ + See driver.py for how to get input tokens by tokenizing a file or + string. + + Parsing is complete when addtoken() returns True; the root of the + abstract syntax tree can then be retrieved from the rootnode + instance variable. When a syntax error occurs, error_recovery() + is called. There is no error recovery; the parser cannot be used + after a syntax error was reported (but it can be reinitialized by + calling setup()). + + """ + + def __init__(self, grammar, convert_node, convert_leaf, error_recovery, start): + """Constructor. + + The grammar argument is a grammar.Grammar instance; see the + grammar module for more information. + + The parser is not ready yet for parsing; you must call the + setup() method to get it started. + + The optional convert argument is a function mapping concrete + syntax tree nodes to abstract syntax tree nodes. If not + given, no conversion is done and the syntax tree produced is + the concrete syntax tree. If given, it must be a function of + two arguments, the first being the grammar (a grammar.Grammar + instance), and the second being the concrete syntax tree node + to be converted. The syntax tree is converted from the bottom + up. + + A concrete syntax tree node is a (type, nodes) tuple, where + type is the node type (a token or symbol number) and nodes + is a list of children for symbols, and None for tokens. + + An abstract syntax tree node may be anything; this is entirely + up to the converter function. + + """ + self.grammar = grammar + self.convert_node = convert_node + self.convert_leaf = convert_leaf + + # Each stack entry is a tuple: (dfa, state, node). + # A node is a tuple: (type, children), + # where children is a list of nodes or None + newnode = (start, []) + stackentry = (self.grammar.dfas[start], 0, newnode) + self.stack = [stackentry] + self.rootnode = None + self.error_recovery = error_recovery + + def parse(self, tokens): + for type_, value, start_pos, prefix in tokens: + if self.addtoken(type_, value, start_pos, prefix): + break + else: + # We never broke out -- EOF is too soon -- Unfinished statement. + # However, the error recovery might have added the token again, if + # the stack is empty, we're fine. + if self.stack: + raise InternalParseError("incomplete input", type_, value, start_pos) + return self.rootnode + + def addtoken(self, type_, value, start_pos, prefix): + """Add a token; return True if this is the end of the program.""" + ilabel = token_to_ilabel(self.grammar, type_, value) + + # Loop until the token is shifted; may raise exceptions + _gram = self.grammar + _labels = _gram.labels + _push = self._push + _pop = self._pop + _shift = self._shift + while True: + dfa, state, node = self.stack[-1] + states, first = dfa + arcs = states[state] + # Look for a state with this label + for i, newstate in arcs: + t, v = _labels[i] + if ilabel == i: + # Look it up in the list of labels + assert t < 256 + # Shift a token; we're done with it + _shift(type_, value, newstate, prefix, start_pos) + # Pop while we are in an accept-only state + state = newstate + while states[state] == [(0, state)]: + _pop() + if not self.stack: + # Done parsing! 
+ return True + dfa, state, node = self.stack[-1] + states, first = dfa + # Done with this token + return False + elif t >= 256: + # See if it's a symbol and if we're in its first set + itsdfa = _gram.dfas[t] + itsstates, itsfirst = itsdfa + if ilabel in itsfirst: + # Push a symbol + _push(t, itsdfa, newstate) + break # To continue the outer while loop + else: + if (0, state) in arcs: + # An accepting state, pop it and try something else + _pop() + if not self.stack: + # Done parsing, but another token is input + raise InternalParseError("too much input", type_, value, start_pos) + else: + self.error_recovery(self.grammar, self.stack, arcs, type_, + value, start_pos, prefix, self.addtoken) + break + + def _shift(self, type_, value, newstate, prefix, start_pos): + """Shift a token. (Internal)""" + dfa, state, node = self.stack[-1] + newnode = self.convert_leaf(self.grammar, type_, value, prefix, start_pos) + node[-1].append(newnode) + self.stack[-1] = (dfa, newstate, node) + + def _push(self, type_, newdfa, newstate): + """Push a nonterminal. (Internal)""" + dfa, state, node = self.stack[-1] + newnode = (type_, []) + self.stack[-1] = (dfa, newstate, node) + self.stack.append((newdfa, 0, newnode)) + + def _pop(self): + """Pop a nonterminal. (Internal)""" + popdfa, popstate, (type_, children) = self.stack.pop() + # If there's exactly one child, return that child instead of creating a + # new node. We still create expr_stmt and file_input though, because a + # lot of Jedi depends on its logic. + if len(children) == 1: + newnode = children[0] + else: + newnode = self.convert_node(self.grammar, type_, children) + + try: + # Equal to: + # dfa, state, node = self.stack[-1] + # symbol, children = node + self.stack[-1][2][1].append(newnode) + except IndexError: + # Stack is empty, set the rootnode. + self.rootnode = newnode diff --git a/parso/pgen2/pgen.py b/parso/pgen2/pgen.py new file mode 100644 index 0000000..96412bc --- /dev/null +++ b/parso/pgen2/pgen.py @@ -0,0 +1,394 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +# Modifications: +# Copyright 2014 David Halter. Integration into Jedi. +# Modifications are dual-licensed: MIT and PSF. 
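# ---------------------------------------------------------------------------
# Editor's note, not part of this commit: the tables produced by this module
# are consumed by PgenParser in parso/pgen2/parse.py. Its class docstring
# still describes the old lib2to3 driver API (setup(), driver.py), which does
# not exist here; in this code base the engine is wired up the way
# BaseParser.parse() in parso/parser.py does it. A minimal sketch, assuming a
# token iterable that yields (type, value, start_pos, prefix) tuples (e.g.
# from parso.tokenize.generate_tokens):
#
#     from parso.pgen2.pgen import generate_grammar
#     from parso.pgen2.parse import PgenParser
#
#     grammar = generate_grammar(bnf_text)   # bnf_text: one of the grammarX.Y.txt files
#     start = grammar.symbol2number['file_input']
#
#     def convert_node(grammar, type_, children):
#         # Called bottom-up for every reduced symbol.
#         return (grammar.number2symbol[type_], children)
#
#     def convert_leaf(grammar, type_, value, prefix, start_pos):
#         # Called for every shifted token.
#         return value
#
#     def error_recovery(grammar, stack, arcs, typ, value, start_pos, prefix,
#                        add_token_callback):
#         raise SyntaxError('invalid syntax at %s' % (start_pos,))
#
#     parser = PgenParser(grammar, convert_node, convert_leaf, error_recovery, start)
#     root = parser.parse(tokens)
# ---------------------------------------------------------------------------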
+ +from parso.pgen2 import grammar +from parso import token +from parso import tokenize + + +class ParserGenerator(object): + def __init__(self, bnf_text): + self._bnf_text = bnf_text + self.generator = tokenize.source_tokens(bnf_text) + self.gettoken() # Initialize lookahead + self.dfas, self.startsymbol = self.parse() + self.first = {} # map from symbol name to set of tokens + self.addfirstsets() + + def make_grammar(self): + c = grammar.Grammar(self._bnf_text) + names = list(self.dfas.keys()) + names.sort() + names.remove(self.startsymbol) + names.insert(0, self.startsymbol) + for name in names: + i = 256 + len(c.symbol2number) + c.symbol2number[name] = i + c.number2symbol[i] = name + for name in names: + dfa = self.dfas[name] + states = [] + for state in dfa: + arcs = [] + for label, next in state.arcs.items(): + arcs.append((self.make_label(c, label), dfa.index(next))) + if state.isfinal: + arcs.append((0, dfa.index(state))) + states.append(arcs) + c.states.append(states) + c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name)) + c.start = c.symbol2number[self.startsymbol] + return c + + def make_first(self, c, name): + rawfirst = self.first[name] + first = {} + for label in rawfirst: + ilabel = self.make_label(c, label) + ##assert ilabel not in first # XXX failed on <> ... != + first[ilabel] = 1 + return first + + def make_label(self, c, label): + # XXX Maybe this should be a method on a subclass of converter? + ilabel = len(c.labels) + if label[0].isalpha(): + # Either a symbol name or a named token + if label in c.symbol2number: + # A symbol name (a non-terminal) + if label in c.symbol2label: + return c.symbol2label[label] + else: + c.labels.append((c.symbol2number[label], None)) + c.symbol2label[label] = ilabel + return ilabel + else: + # A named token (NAME, NUMBER, STRING) + itoken = getattr(token, label, None) + assert isinstance(itoken, int), label + assert itoken in token.tok_name, label + if itoken in c.tokens: + return c.tokens[itoken] + else: + c.labels.append((itoken, None)) + c.tokens[itoken] = ilabel + return ilabel + else: + # Either a keyword or an operator + assert label[0] in ('"', "'"), label + value = eval(label) + if value[0].isalpha(): + # A keyword + if value in c.keywords: + return c.keywords[value] + else: + c.labels.append((token.NAME, value)) + c.keywords[value] = ilabel + return ilabel + else: + # An operator (any non-numeric token) + itoken = token.opmap[value] # Fails if unknown token + if itoken in c.tokens: + return c.tokens[itoken] + else: + c.labels.append((itoken, None)) + c.tokens[itoken] = ilabel + return ilabel + + def addfirstsets(self): + names = list(self.dfas.keys()) + names.sort() + for name in names: + if name not in self.first: + self.calcfirst(name) + #print name, self.first[name].keys() + + def calcfirst(self, name): + dfa = self.dfas[name] + self.first[name] = None # dummy to detect left recursion + state = dfa[0] + totalset = {} + overlapcheck = {} + for label, next in state.arcs.items(): + if label in self.dfas: + if label in self.first: + fset = self.first[label] + if fset is None: + raise ValueError("recursion for rule %r" % name) + else: + self.calcfirst(label) + fset = self.first[label] + totalset.update(fset) + overlapcheck[label] = fset + else: + totalset[label] = 1 + overlapcheck[label] = {label: 1} + inverse = {} + for label, itsfirst in overlapcheck.items(): + for symbol in itsfirst: + if symbol in inverse: + raise ValueError("rule %s is ambiguous; %s is in the" + " first sets of %s as well as %s" % + (name, 
symbol, label, inverse[symbol])) + inverse[symbol] = label + self.first[name] = totalset + + def parse(self): + dfas = {} + startsymbol = None + # MSTART: (NEWLINE | RULE)* ENDMARKER + while self.type != token.ENDMARKER: + while self.type == token.NEWLINE: + self.gettoken() + # RULE: NAME ':' RHS NEWLINE + name = self.expect(token.NAME) + self.expect(token.OP, ":") + a, z = self.parse_rhs() + self.expect(token.NEWLINE) + #self.dump_nfa(name, a, z) + dfa = self.make_dfa(a, z) + #self.dump_dfa(name, dfa) + # oldlen = len(dfa) + self.simplify_dfa(dfa) + # newlen = len(dfa) + dfas[name] = dfa + #print name, oldlen, newlen + if startsymbol is None: + startsymbol = name + return dfas, startsymbol + + def make_dfa(self, start, finish): + # To turn an NFA into a DFA, we define the states of the DFA + # to correspond to *sets* of states of the NFA. Then do some + # state reduction. Let's represent sets as dicts with 1 for + # values. + assert isinstance(start, NFAState) + assert isinstance(finish, NFAState) + + def closure(state): + base = {} + addclosure(state, base) + return base + + def addclosure(state, base): + assert isinstance(state, NFAState) + if state in base: + return + base[state] = 1 + for label, next in state.arcs: + if label is None: + addclosure(next, base) + + states = [DFAState(closure(start), finish)] + for state in states: # NB states grows while we're iterating + arcs = {} + for nfastate in state.nfaset: + for label, next in nfastate.arcs: + if label is not None: + addclosure(next, arcs.setdefault(label, {})) + for label, nfaset in arcs.items(): + for st in states: + if st.nfaset == nfaset: + break + else: + st = DFAState(nfaset, finish) + states.append(st) + state.addarc(st, label) + return states # List of DFAState instances; first one is start + + def dump_nfa(self, name, start, finish): + print("Dump of NFA for", name) + todo = [start] + for i, state in enumerate(todo): + print(" State", i, state is finish and "(final)" or "") + for label, next in state.arcs: + if next in todo: + j = todo.index(next) + else: + j = len(todo) + todo.append(next) + if label is None: + print(" -> %d" % j) + else: + print(" %s -> %d" % (label, j)) + + def dump_dfa(self, name, dfa): + print("Dump of DFA for", name) + for i, state in enumerate(dfa): + print(" State", i, state.isfinal and "(final)" or "") + for label, next in state.arcs.items(): + print(" %s -> %d" % (label, dfa.index(next))) + + def simplify_dfa(self, dfa): + # This is not theoretically optimal, but works well enough. + # Algorithm: repeatedly look for two states that have the same + # set of arcs (same labels pointing to the same nodes) and + # unify them, until things stop changing. 
+ + # dfa is a list of DFAState instances + changes = True + while changes: + changes = False + for i, state_i in enumerate(dfa): + for j in range(i + 1, len(dfa)): + state_j = dfa[j] + if state_i == state_j: + #print " unify", i, j + del dfa[j] + for state in dfa: + state.unifystate(state_j, state_i) + changes = True + break + + def parse_rhs(self): + # RHS: ALT ('|' ALT)* + a, z = self.parse_alt() + if self.value != "|": + return a, z + else: + aa = NFAState() + zz = NFAState() + aa.addarc(a) + z.addarc(zz) + while self.value == "|": + self.gettoken() + a, z = self.parse_alt() + aa.addarc(a) + z.addarc(zz) + return aa, zz + + def parse_alt(self): + # ALT: ITEM+ + a, b = self.parse_item() + while (self.value in ("(", "[") or + self.type in (token.NAME, token.STRING)): + c, d = self.parse_item() + b.addarc(c) + b = d + return a, b + + def parse_item(self): + # ITEM: '[' RHS ']' | ATOM ['+' | '*'] + if self.value == "[": + self.gettoken() + a, z = self.parse_rhs() + self.expect(token.OP, "]") + a.addarc(z) + return a, z + else: + a, z = self.parse_atom() + value = self.value + if value not in ("+", "*"): + return a, z + self.gettoken() + z.addarc(a) + if value == "+": + return a, z + else: + return a, a + + def parse_atom(self): + # ATOM: '(' RHS ')' | NAME | STRING + if self.value == "(": + self.gettoken() + a, z = self.parse_rhs() + self.expect(token.OP, ")") + return a, z + elif self.type in (token.NAME, token.STRING): + a = NFAState() + z = NFAState() + a.addarc(z, self.value) + self.gettoken() + return a, z + else: + self.raise_error("expected (...) or NAME or STRING, got %s/%s", + self.type, self.value) + + def expect(self, type, value=None): + if self.type != type or (value is not None and self.value != value): + self.raise_error("expected %s/%s, got %s/%s", + type, value, self.type, self.value) + value = self.value + self.gettoken() + return value + + def gettoken(self): + tup = next(self.generator) + while tup[0] in (token.COMMENT, token.NL): + tup = next(self.generator) + self.type, self.value, self.begin, prefix = tup + #print tokenize.tok_name[self.type], repr(self.value) + + def raise_error(self, msg, *args): + if args: + try: + msg = msg % args + except: + msg = " ".join([msg] + list(map(str, args))) + line = open(self.filename).readlines()[self.begin[0]] + raise SyntaxError(msg, (self.filename, self.begin[0], + self.begin[1], line)) + + +class NFAState(object): + def __init__(self): + self.arcs = [] # list of (label, NFAState) pairs + + def addarc(self, next, label=None): + assert label is None or isinstance(label, str) + assert isinstance(next, NFAState) + self.arcs.append((label, next)) + + +class DFAState(object): + def __init__(self, nfaset, final): + assert isinstance(nfaset, dict) + assert isinstance(next(iter(nfaset)), NFAState) + assert isinstance(final, NFAState) + self.nfaset = nfaset + self.isfinal = final in nfaset + self.arcs = {} # map from label to DFAState + + def addarc(self, next, label): + assert isinstance(label, str) + assert label not in self.arcs + assert isinstance(next, DFAState) + self.arcs[label] = next + + def unifystate(self, old, new): + for label, next in self.arcs.items(): + if next is old: + self.arcs[label] = new + + def __eq__(self, other): + # Equality test -- ignore the nfaset instance variable + assert isinstance(other, DFAState) + if self.isfinal != other.isfinal: + return False + # Can't just return self.arcs == other.arcs, because that + # would invoke this method recursively, with cycles... 
+ if len(self.arcs) != len(other.arcs): + return False + for label, next in self.arcs.items(): + if next is not other.arcs.get(label): + return False + return True + + __hash__ = None # For Py3 compatibility. + + +def generate_grammar(bnf_text): + """ + ``bnf_text`` is a grammar in extended BNF (using * for repetition, + for + at-least-once repetition, [] for optional parts, | for alternatives and () + for grouping). + + It's not EBNF according to ISO/IEC 14977. It's a dialect Python uses in its + own parser. + """ + p = ParserGenerator(bnf_text) + return p.make_grammar() diff --git a/parso/python/__init__.py b/parso/python/__init__.py new file mode 100644 index 0000000..3657dc2 --- /dev/null +++ b/parso/python/__init__.py @@ -0,0 +1,124 @@ +""" +Parsers for Python +""" +import os + +from jedi import settings +from jedi.common import splitlines, source_to_unicode +from jedi._compatibility import FileNotFoundError +from parso.pgen2.pgen import generate_grammar +from parso.python.parser import Parser, _remove_last_newline +from parso.python.diff import DiffParser +from parso.tokenize import generate_tokens +from parso.cache import parser_cache, load_module, save_module + + +_loaded_grammars = {} + + +def load_grammar(version=None): + """ + Loads a Python grammar. The default version is always the latest. + + If you need support for a specific version, please use e.g. + `version='3.3'`. + """ + if version is None: + version = '3.6' + + if version in ('3.2', '3.3'): + version = '3.4' + elif version == '2.6': + version = '2.7' + + file = 'grammar' + version + '.txt' + + global _loaded_grammars + path = os.path.join(os.path.dirname(__file__), file) + try: + return _loaded_grammars[path] + except KeyError: + try: + with open(path) as f: + bnf_text = f.read() + grammar = generate_grammar(bnf_text) + return _loaded_grammars.setdefault(path, grammar) + except FileNotFoundError: + # Just load the default if the file does not exist. + return load_grammar() + + +def parse(code=None, path=None, grammar=None, error_recovery=True, + start_symbol='file_input', cache=False, diff_cache=False): + """ + If you want to parse a Python file you want to start here, most likely. + + If you need finer grained control over the parsed instance, there will be + other ways to access it. + + :param code: A unicode string that contains Python code. + :param path: The path to the file you want to open. Only needed for caching. + :param grammar: A Python grammar file, created with load_grammar. You may + not specify it. In that case it's the current Python version. + :param error_recovery: If enabled, any code will be returned. If it is + invalid, it will be returned as an error node. If disabled, you will + get a ParseError when encountering syntax errors in your code. + :param start_symbol: The grammar symbol that you want to parse. Only + allowed to be used when error_recovery is disabled. + + :return: A syntax tree node. Typically the module. + """ + if code is None and path is None: + raise TypeError("Please provide either code or a path.") + + if grammar is None: + grammar = load_grammar() + + if cache and not code and path is not None: + # In this case we do actual caching. We just try to load it. 
+ module_node = load_module(grammar, path) + if module_node is not None: + return module_node + + if code is None: + with open(path, 'rb') as f: + code = source_to_unicode(f.read()) + + if diff_cache and settings.fast_parser: + try: + module_cache_item = parser_cache[path] + except KeyError: + pass + else: + lines = splitlines(code, keepends=True) + module_node = module_cache_item.node + old_lines = module_cache_item.lines + if old_lines == lines: + save_module(grammar, path, module_node, lines, pickling=False) + return module_node + + new_node = DiffParser(grammar, module_node).update( + old_lines=old_lines, + new_lines=lines + ) + save_module(grammar, path, new_node, lines, pickling=cache) + return new_node + + added_newline = not code.endswith('\n') + lines = tokenize_lines = splitlines(code, keepends=True) + if added_newline: + code += '\n' + tokenize_lines = list(tokenize_lines) + tokenize_lines[-1] += '\n' + tokenize_lines.append('') + + tokens = generate_tokens(tokenize_lines, use_exact_op_types=True) + + p = Parser(grammar, error_recovery=error_recovery, start_symbol=start_symbol) + root_node = p.parse(tokens=tokens) + if added_newline: + _remove_last_newline(root_node) + + if cache or diff_cache: + save_module(grammar, path, root_node, lines, pickling=cache) + return root_node diff --git a/parso/python/diff.py b/parso/python/diff.py new file mode 100644 index 0000000..47b1982 --- /dev/null +++ b/parso/python/diff.py @@ -0,0 +1,603 @@ +""" +Basically a contains parser that is faster, because it tries to parse only +parts and if anything changes, it only reparses the changed parts. + +It works with a simple diff in the beginning and will try to reuse old parser +fragments. +""" +import re +import difflib +from collections import namedtuple + +from jedi.common import splitlines +from jedi import debug +from parso.python.parser import Parser, _remove_last_newline +from parso.python.tree import EndMarker +from parso.tokenize import (generate_tokens, NEWLINE, TokenInfo, + ENDMARKER, INDENT, DEDENT) + + +def _get_last_line(node_or_leaf): + last_leaf = node_or_leaf.get_last_leaf() + if _ends_with_newline(last_leaf): + return last_leaf.start_pos[0] + else: + return last_leaf.end_pos[0] + + +def _ends_with_newline(leaf, suffix=''): + if leaf.type == 'error_leaf': + typ = leaf.original_type + else: + typ = leaf.type + + return typ == 'newline' or suffix.endswith('\n') + + +def _flows_finished(grammar, stack): + """ + if, while, for and try might not be finished, because another part might + still be parsed. + """ + for dfa, newstate, (symbol_number, nodes) in stack: + if grammar.number2symbol[symbol_number] in ('if_stmt', 'while_stmt', + 'for_stmt', 'try_stmt'): + return False + return True + + +def suite_or_file_input_is_valid(grammar, stack): + if not _flows_finished(grammar, stack): + return False + + for dfa, newstate, (symbol_number, nodes) in reversed(stack): + if grammar.number2symbol[symbol_number] == 'suite': + # If only newline is in the suite, the suite is not valid, yet. + return len(nodes) > 1 + # Not reaching a suite means that we're dealing with file_input levels + # where there's no need for a valid statement in it. It can also be empty. 
+ return True + + +def _is_flow_node(node): + try: + value = node.children[0].value + except AttributeError: + return False + return value in ('if', 'for', 'while', 'try') + + +class _PositionUpdatingFinished(Exception): + pass + + +def _update_positions(nodes, line_offset, last_leaf): + for node in nodes: + try: + children = node.children + except AttributeError: + # Is a leaf + node.line += line_offset + if node is last_leaf: + raise _PositionUpdatingFinished + else: + _update_positions(children, line_offset, last_leaf) + + +class DiffParser(object): + """ + An advanced form of parsing a file faster. Unfortunately comes with huge + side effects. It changes the given module. + """ + def __init__(self, grammar, module): + self._grammar = grammar + self._module = module + + def _reset(self): + self._copy_count = 0 + self._parser_count = 0 + + self._nodes_stack = _NodesStack(self._module) + + def update(self, old_lines, new_lines): + ''' + The algorithm works as follows: + + Equal: + - Assure that the start is a newline, otherwise parse until we get + one. + - Copy from parsed_until_line + 1 to max(i2 + 1) + - Make sure that the indentation is correct (e.g. add DEDENT) + - Add old and change positions + Insert: + - Parse from parsed_until_line + 1 to min(j2 + 1), hopefully not + much more. + + Returns the new module node. + ''' + debug.speed('diff parser start') + # Reset the used names cache so they get regenerated. + self._module._used_names = None + + self._parser_lines_new = new_lines + self._added_newline = False + if new_lines[-1] != '': + # The Python grammar needs a newline at the end of a file, but for + # everything else we keep working with new_lines here. + self._parser_lines_new = list(new_lines) + self._parser_lines_new[-1] += '\n' + self._parser_lines_new.append('') + self._added_newline = True + + self._reset() + + line_length = len(new_lines) + sm = difflib.SequenceMatcher(None, old_lines, self._parser_lines_new) + opcodes = sm.get_opcodes() + debug.speed('diff parser calculated') + debug.dbg('diff: line_lengths old: %s, new: %s' % (len(old_lines), line_length)) + + for operation, i1, i2, j1, j2 in opcodes: + debug.dbg('diff %s old[%s:%s] new[%s:%s]', + operation, i1 + 1, i2, j1 + 1, j2) + + if j2 == line_length + int(self._added_newline): + # The empty part after the last newline is not relevant. + j2 -= 1 + + if operation == 'equal': + line_offset = j1 - i1 + self._copy_from_old_parser(line_offset, i2, j2) + elif operation == 'replace': + self._parse(until_line=j2) + elif operation == 'insert': + self._parse(until_line=j2) + else: + assert operation == 'delete' + + # With this action all change will finally be applied and we have a + # changed module. + self._nodes_stack.close() + + if self._added_newline: + _remove_last_newline(self._module) + + # Good for debugging. + if debug.debug_function: + self._enabled_debugging(old_lines, new_lines) + last_pos = self._module.end_pos[0] + if last_pos != line_length: + current_lines = splitlines(self._module.get_code(), keepends=True) + diff = difflib.unified_diff(current_lines, new_lines) + raise Exception( + "There's an issue (%s != %s) with the diff parser. 
Please report:\n%s" + % (last_pos, line_length, ''.join(diff)) + ) + + debug.speed('diff parser end') + return self._module + + def _enabled_debugging(self, old_lines, lines_new): + if self._module.get_code() != ''.join(lines_new): + debug.warning('parser issue:\n%s\n%s', ''.join(old_lines), + ''.join(lines_new)) + + def _copy_from_old_parser(self, line_offset, until_line_old, until_line_new): + copied_nodes = [None] + + last_until_line = -1 + while until_line_new > self._nodes_stack.parsed_until_line: + parsed_until_line_old = self._nodes_stack.parsed_until_line - line_offset + line_stmt = self._get_old_line_stmt(parsed_until_line_old + 1) + if line_stmt is None: + # Parse 1 line at least. We don't need more, because we just + # want to get into a state where the old parser has statements + # again that can be copied (e.g. not lines within parentheses). + self._parse(self._nodes_stack.parsed_until_line + 1) + elif not copied_nodes: + # We have copied as much as possible (but definitely not too + # much). Therefore we just parse the rest. + # We might not reach the end, because there's a statement + # that is not finished. + self._parse(until_line_new) + else: + p_children = line_stmt.parent.children + index = p_children.index(line_stmt) + + copied_nodes = self._nodes_stack.copy_nodes( + p_children[index:], + until_line_old, + line_offset + ) + # Match all the nodes that are in the wanted range. + if copied_nodes: + self._copy_count += 1 + + from_ = copied_nodes[0].get_start_pos_of_prefix()[0] + line_offset + to = self._nodes_stack.parsed_until_line + + debug.dbg('diff actually copy %s to %s', from_, to) + # Since there are potential bugs that might loop here endlessly, we + # just stop here. + assert last_until_line != self._nodes_stack.parsed_until_line \ + or not copied_nodes, last_until_line + last_until_line = self._nodes_stack.parsed_until_line + + def _get_old_line_stmt(self, old_line): + leaf = self._module.get_leaf_for_position((old_line, 0), include_prefixes=True) + + if _ends_with_newline(leaf): + leaf = leaf.get_next_leaf() + if leaf.get_start_pos_of_prefix()[0] == old_line: + node = leaf + while node.parent.type not in ('file_input', 'suite'): + node = node.parent + return node + # Must be on the same line. Otherwise we need to parse that bit. + return None + + def _get_before_insertion_node(self): + if self._nodes_stack.is_empty(): + return None + + line = self._nodes_stack.parsed_until_line + 1 + node = self._new_module.get_last_leaf() + while True: + parent = node.parent + if parent.type in ('suite', 'file_input'): + assert node.end_pos[0] <= line + assert node.end_pos[1] == 0 or '\n' in self._prefix + return node + node = parent + + def _parse(self, until_line): + """ + Parses at least until the given line, but might just parse more until a + valid state is reached. + """ + last_until_line = 0 + while until_line > self._nodes_stack.parsed_until_line: + node = self._try_parse_part(until_line) + nodes = self._get_children_nodes(node) + #self._insert_nodes(nodes) + + self._nodes_stack.add_parsed_nodes(nodes) + debug.dbg( + 'parse part %s to %s (to %s in parser)', + nodes[0].get_start_pos_of_prefix()[0], + self._nodes_stack.parsed_until_line, + node.end_pos[0] - 1 + ) + # Since the tokenizer sometimes has bugs, we cannot be sure that + # this loop terminates. Therefore assert that there's always a + # change. 
+ assert last_until_line != self._nodes_stack.parsed_until_line, last_until_line + last_until_line = self._nodes_stack.parsed_until_line + + def _get_children_nodes(self, node): + nodes = node.children + first_element = nodes[0] + # TODO this looks very strange... + if first_element.type == 'error_leaf' and \ + first_element.original_type == 'indent': + assert False, str(nodes) + + return nodes + + def _try_parse_part(self, until_line): + """ + Sets up a normal parser that uses a spezialized tokenizer to only parse + until a certain position (or a bit longer if the statement hasn't + ended. + """ + self._parser_count += 1 + # TODO speed up, shouldn't copy the whole list all the time. + # memoryview? + parsed_until_line = self._nodes_stack.parsed_until_line + lines_after = self._parser_lines_new[parsed_until_line:] + #print('parse_content', parsed_until_line, lines_after, until_line) + tokens = self._diff_tokenize( + lines_after, + until_line, + line_offset=parsed_until_line + ) + self._active_parser = Parser( + self._grammar, + error_recovery=True + ) + return self._active_parser.parse(tokens=tokens) + + def _diff_tokenize(self, lines, until_line, line_offset=0): + is_first_token = True + omitted_first_indent = False + indents = [] + tokens = generate_tokens(lines, use_exact_op_types=True) + stack = self._active_parser.pgen_parser.stack + for typ, string, start_pos, prefix in tokens: + start_pos = start_pos[0] + line_offset, start_pos[1] + if typ == INDENT: + indents.append(start_pos[1]) + if is_first_token: + omitted_first_indent = True + # We want to get rid of indents that are only here because + # we only parse part of the file. These indents would only + # get parsed as error leafs, which doesn't make any sense. + is_first_token = False + continue + is_first_token = False + + if typ == DEDENT: + indents.pop() + if omitted_first_indent and not indents: + # We are done here, only thing that can come now is an + # endmarker or another dedented code block. + typ, string, start_pos, prefix = next(tokens) + if '\n' in prefix: + prefix = re.sub(r'(<=\n)[^\n]+$', '', prefix) + else: + prefix = '' + yield TokenInfo(ENDMARKER, '', (start_pos[0] + line_offset, 0), prefix) + break + elif typ == NEWLINE and start_pos[0] >= until_line: + yield TokenInfo(typ, string, start_pos, prefix) + # Check if the parser is actually in a valid suite state. 
+ if suite_or_file_input_is_valid(self._grammar, stack): + start_pos = start_pos[0] + 1, 0 + while len(indents) > int(omitted_first_indent): + indents.pop() + yield TokenInfo(DEDENT, '', start_pos, '') + + yield TokenInfo(ENDMARKER, '', start_pos, '') + break + else: + continue + + yield TokenInfo(typ, string, start_pos, prefix) + + +class _NodesStackNode(object): + ChildrenGroup = namedtuple('ChildrenGroup', 'children line_offset last_line_offset_leaf') + + def __init__(self, tree_node, parent=None): + self.tree_node = tree_node + self.children_groups = [] + self.parent = parent + + def close(self): + children = [] + for children_part, line_offset, last_line_offset_leaf in self.children_groups: + if line_offset != 0: + try: + _update_positions( + children_part, line_offset, last_line_offset_leaf) + except _PositionUpdatingFinished: + pass + children += children_part + self.tree_node.children = children + # Reset the parents + for node in children: + node.parent = self.tree_node + + def add(self, children, line_offset=0, last_line_offset_leaf=None): + group = self.ChildrenGroup(children, line_offset, last_line_offset_leaf) + self.children_groups.append(group) + + def get_last_line(self, suffix): + line = 0 + if self.children_groups: + children_group = self.children_groups[-1] + last_leaf = children_group.children[-1].get_last_leaf() + line = last_leaf.end_pos[0] + + # Calculate the line offsets + offset = children_group.line_offset + if offset: + # In case the line_offset is not applied to this specific leaf, + # just ignore it. + if last_leaf.line <= children_group.last_line_offset_leaf.line: + line += children_group.line_offset + + # Newlines end on the next line, which means that they would cover + # the next line. That line is not fully parsed at this point. + if _ends_with_newline(last_leaf, suffix): + line -= 1 + line += suffix.count('\n') + return line + + +class _NodesStack(object): + endmarker_type = 'endmarker' + + def __init__(self, module): + # Top of stack + self._tos = self._base_node = _NodesStackNode(module) + self._module = module + self._last_prefix = '' + self.prefix = '' + + def is_empty(self): + return not self._base_node.children + + @property + def parsed_until_line(self): + return self._tos.get_last_line(self.prefix) + + def _get_insertion_node(self, indentation_node): + indentation = indentation_node.start_pos[1] + + # find insertion node + node = self._tos + while True: + tree_node = node.tree_node + if tree_node.type == 'suite': + # A suite starts with NEWLINE, ... + node_indentation = tree_node.children[1].start_pos[1] + + if indentation >= node_indentation: # Not a Dedent + # We might be at the most outer layer: modules. We + # don't want to depend on the first statement + # having the right indentation. + return node + + elif tree_node.type == 'file_input': + return node + + node = self._close_tos() + + def _close_tos(self): + self._tos.close() + self._tos = self._tos.parent + return self._tos + + def add_parsed_nodes(self, tree_nodes): + tree_nodes = self._remove_endmarker(tree_nodes) + if not tree_nodes: + return + + assert tree_nodes[0].type != 'newline' + + node = self._get_insertion_node(tree_nodes[0]) + assert node.tree_node.type in ('suite', 'file_input') + node.add(tree_nodes) + self._update_tos(tree_nodes[-1]) + + def _remove_endmarker(self, tree_nodes): + """ + Helps cleaning up the tree nodes that get inserted. 
+ """ + last_leaf = tree_nodes[-1].get_last_leaf() + is_endmarker = last_leaf.type == self.endmarker_type + self._last_prefix = '' + if is_endmarker: + try: + separation = last_leaf.prefix.rindex('\n') + except ValueError: + pass + else: + # Remove the whitespace part of the prefix after a newline. + # That is not relevant if parentheses were opened. Always parse + # until the end of a line. + last_leaf.prefix, self._last_prefix = \ + last_leaf.prefix[:separation + 1], last_leaf.prefix[separation + 1:] + + first_leaf = tree_nodes[0].get_first_leaf() + first_leaf.prefix = self.prefix + first_leaf.prefix + self.prefix = '' + + if is_endmarker: + self.prefix = last_leaf.prefix + + tree_nodes = tree_nodes[:-1] + + return tree_nodes + + def copy_nodes(self, tree_nodes, until_line, line_offset): + """ + Copies tree nodes from the old parser tree. + + Returns the number of tree nodes that were copied. + """ + tos = self._get_insertion_node(tree_nodes[0]) + + new_nodes, self._tos = self._copy_nodes(tos, tree_nodes, until_line, line_offset) + return new_nodes + + def _copy_nodes(self, tos, nodes, until_line, line_offset): + new_nodes = [] + + new_tos = tos + for node in nodes: + if node.type == 'endmarker': + # Endmarkers just distort all the checks below. Remove them. + break + + if node.start_pos[0] > until_line: + break + # TODO this check might take a bit of time for large files. We + # might want to change this to do more intelligent guessing or + # binary search. + if _get_last_line(node) > until_line: + # We can split up functions and classes later. + if node.type in ('classdef', 'funcdef') and node.children[-1].type == 'suite': + new_nodes.append(node) + break + + new_nodes.append(node) + + if not new_nodes: + return [], tos + + last_node = new_nodes[-1] + line_offset_index = -1 + if last_node.type in ('classdef', 'funcdef'): + suite = last_node.children[-1] + if suite.type == 'suite': + suite_tos = _NodesStackNode(suite) + # Don't need to pass line_offset here, it's already done by the + # parent. + suite_nodes, recursive_tos = self._copy_nodes( + suite_tos, suite.children, until_line, line_offset) + if len(suite_nodes) < 2: + # A suite only with newline is not valid. + new_nodes.pop() + else: + suite_tos.parent = tos + new_tos = recursive_tos + line_offset_index = -2 + + elif (new_nodes[-1].type in ('error_leaf', 'error_node') or + _is_flow_node(new_nodes[-1])): + # Error leafs/nodes don't have a defined start/end. Error + # nodes might not end with a newline (e.g. if there's an + # open `(`). Therefore ignore all of them unless they are + # succeeded with valid parser state. + # If we copy flows at the end, they might be continued + # after the copy limit (in the new parser). + # In this while loop we try to remove until we find a newline. + new_nodes.pop() + while new_nodes: + last_node = new_nodes[-1] + if last_node.get_last_leaf().type == 'newline': + break + new_nodes.pop() + + if new_nodes: + try: + last_line_offset_leaf = new_nodes[line_offset_index].get_last_leaf() + except IndexError: + line_offset = 0 + # In this case we don't have to calculate an offset, because + # there's no children to be managed. 
+ last_line_offset_leaf = None + tos.add(new_nodes, line_offset, last_line_offset_leaf) + return new_nodes, new_tos + + def _update_tos(self, tree_node): + if tree_node.type in ('suite', 'file_input'): + self._tos = _NodesStackNode(tree_node, self._tos) + self._tos.add(list(tree_node.children)) + self._update_tos(tree_node.children[-1]) + elif tree_node.type in ('classdef', 'funcdef'): + self._update_tos(tree_node.children[-1]) + + def close(self): + while self._tos is not None: + self._close_tos() + + # Add an endmarker. + try: + last_leaf = self._module.get_last_leaf() + end_pos = list(last_leaf.end_pos) + except IndexError: + end_pos = [1, 0] + lines = splitlines(self.prefix) + assert len(lines) > 0 + if len(lines) == 1: + end_pos[1] += len(lines[0]) + else: + end_pos[0] += len(lines) - 1 + end_pos[1] = len(lines[-1]) + + endmarker = EndMarker('', tuple(end_pos), self.prefix + self._last_prefix) + endmarker.parent = self._module + self._module.children.append(endmarker) diff --git a/parso/python/grammar2.7.txt b/parso/python/grammar2.7.txt new file mode 100644 index 0000000..515dea6 --- /dev/null +++ b/parso/python/grammar2.7.txt @@ -0,0 +1,152 @@ +# Grammar for 2to3. This grammar supports Python 2.x and 3.x. + +# Note: Changing the grammar specified in this file will most likely +# require corresponding changes in the parser module +# (../Modules/parsermodule.c). If you can't make the changes to +# that module yourself, please co-ordinate the required changes +# with someone who can; ask around on python-dev for help. Fred +# Drake will probably be listening there. + +# NOTE WELL: You should also follow all the steps listed in PEP 306, +# "How to Change Python's Grammar" + + +# Start symbols for the grammar: +# file_input is a module or sequence of commands read from an input file; +# single_input is a single interactive statement; +# eval_input is the input for the eval() and input() functions. +# NB: compound_stmt in single_input is followed by extra NEWLINE! 
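# Editor's note, not part of this commit: the grammarX.Y.txt files are read by
# parso.pgen2.pgen.generate_grammar(), which accepts the dialect described in
# that function's docstring: * for repetition, + for at-least-once repetition,
# [] for optional parts, | for alternatives and () for grouping. As a rough
# worked example, the statement `x = 1` is derived through
#
#   file_input -> stmt -> simple_stmt -> small_stmt -> expr_stmt
#   expr_stmt  -> testlist_star_expr '=' testlist_star_expr
#   testlist_star_expr -> test -> or_test -> ... -> power -> atom -> NAME / NUMBER
#
# Since PgenParser._pop() collapses nodes that have exactly one child, most of
# that single-child chain never appears in the resulting tree.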
+file_input: (NEWLINE | stmt)* ENDMARKER +single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE +eval_input: testlist NEWLINE* ENDMARKER + +decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE +decorators: decorator+ +decorated: decorators (classdef | funcdef) +funcdef: 'def' NAME parameters ['->' test] ':' suite +parameters: '(' [typedargslist] ')' +typedargslist: ((tfpdef ['=' test] ',')* + ('*' [tname] (',' tname ['=' test])* [',' '**' tname] | '**' tname) + | tfpdef ['=' test] (',' tfpdef ['=' test])* [',']) +tname: NAME [':' test] +tfpdef: tname | '(' tfplist ')' +tfplist: tfpdef (',' tfpdef)* [','] +varargslist: ((vfpdef ['=' test] ',')* + ('*' [vname] (',' vname ['=' test])* [',' '**' vname] | '**' vname) + | vfpdef ['=' test] (',' vfpdef ['=' test])* [',']) +vname: NAME +vfpdef: vname | '(' vfplist ')' +vfplist: vfpdef (',' vfpdef)* [','] + +stmt: simple_stmt | compound_stmt +simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE +small_stmt: (expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt | + import_stmt | global_stmt | exec_stmt | assert_stmt) +expr_stmt: testlist_star_expr (augassign (yield_expr|testlist) | + ('=' (yield_expr|testlist_star_expr))*) +testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [','] +augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' | + '<<=' | '>>=' | '**=' | '//=') +# For normal assignments, additional restrictions enforced by the interpreter +print_stmt: 'print' ( [ test (',' test)* [','] ] | + '>>' test [ (',' test)+ [','] ] ) +del_stmt: 'del' exprlist +pass_stmt: 'pass' +flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt +break_stmt: 'break' +continue_stmt: 'continue' +return_stmt: 'return' [testlist] +yield_stmt: yield_expr +raise_stmt: 'raise' [test [',' test [',' test]]] +import_stmt: import_name | import_from +import_name: 'import' dotted_as_names +# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS +import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+) + 'import' ('*' | '(' import_as_names ')' | import_as_names)) +import_as_name: NAME ['as' NAME] +dotted_as_name: dotted_name ['as' NAME] +import_as_names: import_as_name (',' import_as_name)* [','] +dotted_as_names: dotted_as_name (',' dotted_as_name)* +dotted_name: NAME ('.' NAME)* +global_stmt: 'global' NAME (',' NAME)* +exec_stmt: 'exec' expr ['in' test [',' test]] +assert_stmt: 'assert' test [',' test] + +compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated +if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite] +while_stmt: 'while' test ':' suite ['else' ':' suite] +for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite] +try_stmt: ('try' ':' suite + ((except_clause ':' suite)+ + ['else' ':' suite] + ['finally' ':' suite] | + 'finally' ':' suite)) +with_stmt: 'with' with_item (',' with_item)* ':' suite +with_item: test ['as' expr] +with_var: 'as' expr +# NB compile.c makes sure that the default except clause is last +except_clause: 'except' [test [(',' | 'as') test]] +# Edit by David Halter: The stmt is now optional. This reflects how Jedi allows +# classes and functions to be empty, which is beneficial for autocompletion. 
+suite: simple_stmt | NEWLINE INDENT stmt* DEDENT + +# Backward compatibility cruft to support: +# [ x for x in lambda: True, lambda: False if x() ] +# even while also allowing: +# lambda x: 5 if x else 2 +# (But not a mix of the two) +testlist_safe: old_test [(',' old_test)+ [',']] +old_test: or_test | old_lambdef +old_lambdef: 'lambda' [varargslist] ':' old_test + +test: or_test ['if' or_test 'else' test] | lambdef +or_test: and_test ('or' and_test)* +and_test: not_test ('and' not_test)* +not_test: 'not' not_test | comparison +comparison: expr (comp_op expr)* +comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not' +star_expr: '*' expr +expr: xor_expr ('|' xor_expr)* +xor_expr: and_expr ('^' and_expr)* +and_expr: shift_expr ('&' shift_expr)* +shift_expr: arith_expr (('<<'|'>>') arith_expr)* +arith_expr: term (('+'|'-') term)* +term: factor (('*'|'/'|'%'|'//') factor)* +factor: ('+'|'-'|'~') factor | power +power: atom trailer* ['**' factor] +atom: ('(' [yield_expr|testlist_comp] ')' | + '[' [testlist_comp] ']' | + '{' [dictorsetmaker] '}' | + '`' testlist1 '`' | + NAME | NUMBER | STRING+ | '.' '.' '.') +# Modification by David Halter, remove `testlist_gexp` and `listmaker` +testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] ) +lambdef: 'lambda' [varargslist] ':' test +trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME +subscriptlist: subscript (',' subscript)* [','] +subscript: test | [test] ':' [test] [sliceop] +sliceop: ':' [test] +exprlist: (expr|star_expr) (',' (expr|star_expr))* [','] +testlist: test (',' test)* [','] +# Modification by David Halter, dictsetmaker -> dictorsetmaker (so that it's +# the same as in the 3.4 grammar). +dictorsetmaker: ( (test ':' test (comp_for | (',' test ':' test)* [','])) | + (test (comp_for | (',' test)* [','])) ) + +classdef: 'class' NAME ['(' [arglist] ')'] ':' suite + +arglist: (argument ',')* (argument [','] + |'*' test (',' argument)* [',' '**' test] + |'**' test) +argument: test [comp_for] | test '=' test # Really [keyword '='] test + +comp_iter: comp_for | comp_if +comp_for: 'for' exprlist 'in' testlist_safe [comp_iter] +comp_if: 'if' old_test [comp_iter] + +testlist1: test (',' test)* + +# not used in grammar, but may appear in "node" passed from Parser to Compiler +encoding_decl: NAME + +yield_expr: 'yield' [testlist] diff --git a/parso/python/grammar3.4.txt b/parso/python/grammar3.4.txt new file mode 100644 index 0000000..d4a32b8 --- /dev/null +++ b/parso/python/grammar3.4.txt @@ -0,0 +1,135 @@ +# Grammar for Python + +# Note: Changing the grammar specified in this file will most likely +# require corresponding changes in the parser module +# (../Modules/parsermodule.c). If you can't make the changes to +# that module yourself, please co-ordinate the required changes +# with someone who can; ask around on python-dev for help. Fred +# Drake will probably be listening there. + +# NOTE WELL: You should also follow all the steps listed in PEP 306, +# "How to Change Python's Grammar" + +# Start symbols for the grammar: +# single_input is a single interactive statement; +# file_input is a module or sequence of commands read from an input file; +# eval_input is the input for the eval() functions. +# NB: compound_stmt in single_input is followed by extra NEWLINE! 
+file_input: (NEWLINE | stmt)* ENDMARKER +single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE +eval_input: testlist NEWLINE* ENDMARKER + +decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE +decorators: decorator+ +decorated: decorators (classdef | funcdef) +funcdef: 'def' NAME parameters ['->' test] ':' suite +parameters: '(' [typedargslist] ')' +typedargslist: (tfpdef ['=' test] (',' tfpdef ['=' test])* [',' + ['*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef]] + | '*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef) +tfpdef: NAME [':' test] +varargslist: (vfpdef ['=' test] (',' vfpdef ['=' test])* [',' + ['*' [vfpdef] (',' vfpdef ['=' test])* [',' '**' vfpdef] | '**' vfpdef]] + | '*' [vfpdef] (',' vfpdef ['=' test])* [',' '**' vfpdef] | '**' vfpdef) +vfpdef: NAME + +stmt: simple_stmt | compound_stmt +simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE +small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | + import_stmt | global_stmt | nonlocal_stmt | assert_stmt) +expr_stmt: testlist_star_expr (augassign (yield_expr|testlist) | + ('=' (yield_expr|testlist_star_expr))*) +testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [','] +augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' | + '<<=' | '>>=' | '**=' | '//=') +# For normal assignments, additional restrictions enforced by the interpreter +del_stmt: 'del' exprlist +pass_stmt: 'pass' +flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt +break_stmt: 'break' +continue_stmt: 'continue' +return_stmt: 'return' [testlist] +yield_stmt: yield_expr +raise_stmt: 'raise' [test ['from' test]] +import_stmt: import_name | import_from +import_name: 'import' dotted_as_names +# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS +import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+) + 'import' ('*' | '(' import_as_names ')' | import_as_names)) +import_as_name: NAME ['as' NAME] +dotted_as_name: dotted_name ['as' NAME] +import_as_names: import_as_name (',' import_as_name)* [','] +dotted_as_names: dotted_as_name (',' dotted_as_name)* +dotted_name: NAME ('.' NAME)* +global_stmt: 'global' NAME (',' NAME)* +nonlocal_stmt: 'nonlocal' NAME (',' NAME)* +assert_stmt: 'assert' test [',' test] + +compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated +if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite] +while_stmt: 'while' test ':' suite ['else' ':' suite] +for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite] +try_stmt: ('try' ':' suite + ((except_clause ':' suite)+ + ['else' ':' suite] + ['finally' ':' suite] | + 'finally' ':' suite)) +with_stmt: 'with' with_item (',' with_item)* ':' suite +with_item: test ['as' expr] +# NB compile.c makes sure that the default except clause is last +except_clause: 'except' [test ['as' NAME]] +# Edit by David Halter: The stmt is now optional. This reflects how Jedi allows +# classes and functions to be empty, which is beneficial for autocompletion. +suite: simple_stmt | NEWLINE INDENT stmt* DEDENT + +test: or_test ['if' or_test 'else' test] | lambdef +test_nocond: or_test | lambdef_nocond +lambdef: 'lambda' [varargslist] ':' test +lambdef_nocond: 'lambda' [varargslist] ':' test_nocond +or_test: and_test ('or' and_test)* +and_test: not_test ('and' not_test)* +not_test: 'not' not_test | comparison +comparison: expr (comp_op expr)* +# <> isn't actually a valid comparison operator in Python. 
It's here for the +# sake of a __future__ import described in PEP 401 +comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not' +star_expr: '*' expr +expr: xor_expr ('|' xor_expr)* +xor_expr: and_expr ('^' and_expr)* +and_expr: shift_expr ('&' shift_expr)* +shift_expr: arith_expr (('<<'|'>>') arith_expr)* +arith_expr: term (('+'|'-') term)* +term: factor (('*'|'/'|'%'|'//') factor)* +factor: ('+'|'-'|'~') factor | power +power: atom trailer* ['**' factor] +atom: ('(' [yield_expr|testlist_comp] ')' | + '[' [testlist_comp] ']' | + '{' [dictorsetmaker] '}' | + NAME | NUMBER | STRING+ | '...' | 'None' | 'True' | 'False') +testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] ) +trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME +subscriptlist: subscript (',' subscript)* [','] +subscript: test | [test] ':' [test] [sliceop] +sliceop: ':' [test] +exprlist: (expr|star_expr) (',' (expr|star_expr))* [','] +testlist: test (',' test)* [','] +dictorsetmaker: ( (test ':' test (comp_for | (',' test ':' test)* [','])) | + (test (comp_for | (',' test)* [','])) ) + +classdef: 'class' NAME ['(' [arglist] ')'] ':' suite + +arglist: (argument ',')* (argument [','] + |'*' test (',' argument)* [',' '**' test] + |'**' test) +# The reason that keywords are test nodes instead of NAME is that using NAME +# results in an ambiguity. ast.c makes sure it's a NAME. +argument: test [comp_for] | test '=' test # Really [keyword '='] test +comp_iter: comp_for | comp_if +comp_for: 'for' exprlist 'in' or_test [comp_iter] +comp_if: 'if' test_nocond [comp_iter] + +# not used in grammar, but may appear in "node" passed from Parser to Compiler +encoding_decl: NAME + +yield_expr: 'yield' [yield_arg] +yield_arg: 'from' test | testlist diff --git a/parso/python/grammar3.5.txt b/parso/python/grammar3.5.txt new file mode 100644 index 0000000..96a7271 --- /dev/null +++ b/parso/python/grammar3.5.txt @@ -0,0 +1,154 @@ +# Grammar for Python + +# Note: Changing the grammar specified in this file will most likely +# require corresponding changes in the parser module +# (../Modules/parsermodule.c). If you can't make the changes to +# that module yourself, please co-ordinate the required changes +# with someone who can; ask around on python-dev for help. Fred +# Drake will probably be listening there. + +# NOTE WELL: You should also follow all the steps listed at +# https://docs.python.org/devguide/grammar.html + +# Start symbols for the grammar: +# single_input is a single interactive statement; +# file_input is a module or sequence of commands read from an input file; +# eval_input is the input for the eval() functions. +# NB: compound_stmt in single_input is followed by extra NEWLINE! 
+file_input: (NEWLINE | stmt)* ENDMARKER +single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE +eval_input: testlist NEWLINE* ENDMARKER + +decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE +decorators: decorator+ +decorated: decorators (classdef | funcdef | async_funcdef) + +# NOTE: Reinoud Elhorst, using ASYNC/AWAIT keywords instead of tokens +# skipping python3.5 compatibility, in favour of 3.7 solution +async_funcdef: 'async' funcdef +funcdef: 'def' NAME parameters ['->' test] ':' suite + +parameters: '(' [typedargslist] ')' +typedargslist: (tfpdef ['=' test] (',' tfpdef ['=' test])* [',' + ['*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef]] + | '*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef) +tfpdef: NAME [':' test] +varargslist: (vfpdef ['=' test] (',' vfpdef ['=' test])* [',' + ['*' [vfpdef] (',' vfpdef ['=' test])* [',' '**' vfpdef] | '**' vfpdef]] + | '*' [vfpdef] (',' vfpdef ['=' test])* [',' '**' vfpdef] | '**' vfpdef) +vfpdef: NAME + +stmt: simple_stmt | compound_stmt +simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE +small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | + import_stmt | global_stmt | nonlocal_stmt | assert_stmt) +expr_stmt: testlist_star_expr (augassign (yield_expr|testlist) | + ('=' (yield_expr|testlist_star_expr))*) +testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [','] +augassign: ('+=' | '-=' | '*=' | '@=' | '/=' | '%=' | '&=' | '|=' | '^=' | + '<<=' | '>>=' | '**=' | '//=') +# For normal assignments, additional restrictions enforced by the interpreter +del_stmt: 'del' exprlist +pass_stmt: 'pass' +flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt +break_stmt: 'break' +continue_stmt: 'continue' +return_stmt: 'return' [testlist] +yield_stmt: yield_expr +raise_stmt: 'raise' [test ['from' test]] +import_stmt: import_name | import_from +import_name: 'import' dotted_as_names +# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS +import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+) + 'import' ('*' | '(' import_as_names ')' | import_as_names)) +import_as_name: NAME ['as' NAME] +dotted_as_name: dotted_name ['as' NAME] +import_as_names: import_as_name (',' import_as_name)* [','] +dotted_as_names: dotted_as_name (',' dotted_as_name)* +dotted_name: NAME ('.' NAME)* +global_stmt: 'global' NAME (',' NAME)* +nonlocal_stmt: 'nonlocal' NAME (',' NAME)* +assert_stmt: 'assert' test [',' test] + +compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt +async_stmt: 'async' (funcdef | with_stmt | for_stmt) +if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite] +while_stmt: 'while' test ':' suite ['else' ':' suite] +for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite] +try_stmt: ('try' ':' suite + ((except_clause ':' suite)+ + ['else' ':' suite] + ['finally' ':' suite] | + 'finally' ':' suite)) +with_stmt: 'with' with_item (',' with_item)* ':' suite +with_item: test ['as' expr] +# NB compile.c makes sure that the default except clause is last +except_clause: 'except' [test ['as' NAME]] +# Edit by David Halter: The stmt is now optional. This reflects how Jedi allows +# classes and functions to be empty, which is beneficial for autocompletion. 
+suite: simple_stmt | NEWLINE INDENT stmt* DEDENT + +test: or_test ['if' or_test 'else' test] | lambdef +test_nocond: or_test | lambdef_nocond +lambdef: 'lambda' [varargslist] ':' test +lambdef_nocond: 'lambda' [varargslist] ':' test_nocond +or_test: and_test ('or' and_test)* +and_test: not_test ('and' not_test)* +not_test: 'not' not_test | comparison +comparison: expr (comp_op expr)* +# <> isn't actually a valid comparison operator in Python. It's here for the +# sake of a __future__ import described in PEP 401 (which really works :-) +comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not' +star_expr: '*' expr +expr: xor_expr ('|' xor_expr)* +xor_expr: and_expr ('^' and_expr)* +and_expr: shift_expr ('&' shift_expr)* +shift_expr: arith_expr (('<<'|'>>') arith_expr)* +arith_expr: term (('+'|'-') term)* +term: factor (('*'|'@'|'/'|'%'|'//') factor)* +factor: ('+'|'-'|'~') factor | power +power: atom_expr ['**' factor] +atom_expr: ['await'] atom trailer* +atom: ('(' [yield_expr|testlist_comp] ')' | + '[' [testlist_comp] ']' | + '{' [dictorsetmaker] '}' | + NAME | NUMBER | STRING+ | '...' | 'None' | 'True' | 'False') +testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] ) +trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME +subscriptlist: subscript (',' subscript)* [','] +subscript: test | [test] ':' [test] [sliceop] +sliceop: ':' [test] +exprlist: (expr|star_expr) (',' (expr|star_expr))* [','] +testlist: test (',' test)* [','] +dictorsetmaker: ( ((test ':' test | '**' expr) + (comp_for | (',' (test ':' test | '**' expr))* [','])) | + ((test | star_expr) + (comp_for | (',' (test | star_expr))* [','])) ) + +classdef: 'class' NAME ['(' [arglist] ')'] ':' suite + +arglist: argument (',' argument)* [','] + +# The reason that keywords are test nodes instead of NAME is that using NAME +# results in an ambiguity. ast.c makes sure it's a NAME. +# "test '=' test" is really "keyword '=' test", but we have no such token. +# These need to be in a single rule to avoid grammar that is ambiguous +# to our LL(1) parser. Even though 'test' includes '*expr' in star_expr, +# we explicitly match '*' here, too, to give it proper precedence. +# Illegal combinations and orderings are blocked in ast.c: +# multiple (test comp_for) arguements are blocked; keyword unpackings +# that precede iterable unpackings are blocked; etc. +argument: ( test [comp_for] | + test '=' test | + '**' test | + '*' test ) + +comp_iter: comp_for | comp_if +comp_for: 'for' exprlist 'in' or_test [comp_iter] +comp_if: 'if' test_nocond [comp_iter] + +# not used in grammar, but may appear in "node" passed from Parser to Compiler +encoding_decl: NAME + +yield_expr: 'yield' [yield_arg] +yield_arg: 'from' test | testlist diff --git a/parso/python/grammar3.6.txt b/parso/python/grammar3.6.txt new file mode 100644 index 0000000..b44a569 --- /dev/null +++ b/parso/python/grammar3.6.txt @@ -0,0 +1,161 @@ +# Grammar for Python + +# Note: Changing the grammar specified in this file will most likely +# require corresponding changes in the parser module +# (../Modules/parsermodule.c). If you can't make the changes to +# that module yourself, please co-ordinate the required changes +# with someone who can; ask around on python-dev for help. Fred +# Drake will probably be listening there. 
+ +# NOTE WELL: You should also follow all the steps listed at +# https://docs.python.org/devguide/grammar.html + +# Start symbols for the grammar: +# file_input is a module or sequence of commands read from an input file; +# single_input is a single interactive statement; +# eval_input is the input for the eval() functions. +# NB: compound_stmt in single_input is followed by extra NEWLINE! +file_input: (NEWLINE | stmt)* ENDMARKER +single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE +eval_input: testlist NEWLINE* ENDMARKER + +decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE +decorators: decorator+ +decorated: decorators (classdef | funcdef | async_funcdef) + +# NOTE: Francisco Souza/Reinoud Elhorst, using ASYNC/'await' keywords instead of +# skipping python3.5+ compatibility, in favour of 3.7 solution +async_funcdef: 'async' funcdef +funcdef: 'def' NAME parameters ['->' test] ':' suite + +parameters: '(' [typedargslist] ')' +typedargslist: (tfpdef ['=' test] (',' tfpdef ['=' test])* [',' [ + '*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]] + | '**' tfpdef [',']]] + | '*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]] + | '**' tfpdef [',']) +tfpdef: NAME [':' test] +varargslist: (vfpdef ['=' test] (',' vfpdef ['=' test])* [',' [ + '*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]] + | '**' vfpdef [',']]] + | '*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]] + | '**' vfpdef [','] +) +vfpdef: NAME + +stmt: simple_stmt | compound_stmt +simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE +small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | + import_stmt | global_stmt | nonlocal_stmt | assert_stmt) +expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) | + ('=' (yield_expr|testlist_star_expr))*) +annassign: ':' test ['=' test] +testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [','] +augassign: ('+=' | '-=' | '*=' | '@=' | '/=' | '%=' | '&=' | '|=' | '^=' | + '<<=' | '>>=' | '**=' | '//=') +# For normal and annotated assignments, additional restrictions enforced by the interpreter +del_stmt: 'del' exprlist +pass_stmt: 'pass' +flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt +break_stmt: 'break' +continue_stmt: 'continue' +return_stmt: 'return' [testlist] +yield_stmt: yield_expr +raise_stmt: 'raise' [test ['from' test]] +import_stmt: import_name | import_from +import_name: 'import' dotted_as_names +# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS +import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+) + 'import' ('*' | '(' import_as_names ')' | import_as_names)) +import_as_name: NAME ['as' NAME] +dotted_as_name: dotted_name ['as' NAME] +import_as_names: import_as_name (',' import_as_name)* [','] +dotted_as_names: dotted_as_name (',' dotted_as_name)* +dotted_name: NAME ('.' 
NAME)* +global_stmt: 'global' NAME (',' NAME)* +nonlocal_stmt: 'nonlocal' NAME (',' NAME)* +assert_stmt: 'assert' test [',' test] + +compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt +async_stmt: 'async' (funcdef | with_stmt | for_stmt) +if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite] +while_stmt: 'while' test ':' suite ['else' ':' suite] +for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite] +try_stmt: ('try' ':' suite + ((except_clause ':' suite)+ + ['else' ':' suite] + ['finally' ':' suite] | + 'finally' ':' suite)) +with_stmt: 'with' with_item (',' with_item)* ':' suite +with_item: test ['as' expr] +# NB compile.c makes sure that the default except clause is last +except_clause: 'except' [test ['as' NAME]] +# Edit by Francisco Souza/David Halter: The stmt is now optional. This reflects +# how Jedi allows classes and functions to be empty, which is beneficial for +# autocompletion. +suite: simple_stmt | NEWLINE INDENT stmt* DEDENT + +test: or_test ['if' or_test 'else' test] | lambdef +test_nocond: or_test | lambdef_nocond +lambdef: 'lambda' [varargslist] ':' test +lambdef_nocond: 'lambda' [varargslist] ':' test_nocond +or_test: and_test ('or' and_test)* +and_test: not_test ('and' not_test)* +not_test: 'not' not_test | comparison +comparison: expr (comp_op expr)* +# <> isn't actually a valid comparison operator in Python. It's here for the +# sake of a __future__ import described in PEP 401 (which really works :-) +comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not' +star_expr: '*' expr +expr: xor_expr ('|' xor_expr)* +xor_expr: and_expr ('^' and_expr)* +and_expr: shift_expr ('&' shift_expr)* +shift_expr: arith_expr (('<<'|'>>') arith_expr)* +arith_expr: term (('+'|'-') term)* +term: factor (('*'|'@'|'/'|'%'|'//') factor)* +factor: ('+'|'-'|'~') factor | power +power: atom_expr ['**' factor] +atom_expr: ['await'] atom trailer* +atom: ('(' [yield_expr|testlist_comp] ')' | + '[' [testlist_comp] ']' | + '{' [dictorsetmaker] '}' | + NAME | NUMBER | STRING+ | '...' | 'None' | 'True' | 'False') +testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] ) +trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME +subscriptlist: subscript (',' subscript)* [','] +subscript: test | [test] ':' [test] [sliceop] +sliceop: ':' [test] +exprlist: (expr|star_expr) (',' (expr|star_expr))* [','] +testlist: test (',' test)* [','] +dictorsetmaker: ( ((test ':' test | '**' expr) + (comp_for | (',' (test ':' test | '**' expr))* [','])) | + ((test | star_expr) + (comp_for | (',' (test | star_expr))* [','])) ) + +classdef: 'class' NAME ['(' [arglist] ')'] ':' suite + +arglist: argument (',' argument)* [','] + +# The reason that keywords are test nodes instead of NAME is that using NAME +# results in an ambiguity. ast.c makes sure it's a NAME. +# "test '=' test" is really "keyword '=' test", but we have no such token. +# These need to be in a single rule to avoid grammar that is ambiguous +# to our LL(1) parser. Even though 'test' includes '*expr' in star_expr, +# we explicitly match '*' here, too, to give it proper precedence. +# Illegal combinations and orderings are blocked in ast.c: +# multiple (test comp_for) arguments are blocked; keyword unpackings +# that precede iterable unpackings are blocked; etc. 
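+# e.g. a call like f(x, n=1, *rest, **kw) is matched purely through the
+# argument alternatives below; which combinations and orderings are actually
+# legal is decided later in ast.c, not by this grammar.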
+argument: ( test [comp_for] | + test '=' test | + '**' test | + '*' test ) + +comp_iter: comp_for | comp_if +comp_for: ['async'] 'for' exprlist 'in' or_test [comp_iter] +comp_if: 'if' test_nocond [comp_iter] + +# not used in grammar, but may appear in "node" passed from Parser to Compiler +encoding_decl: NAME + +yield_expr: 'yield' [yield_arg] +yield_arg: 'from' test | testlist diff --git a/parso/python/parser.py b/parso/python/parser.py new file mode 100644 index 0000000..bdd564c --- /dev/null +++ b/parso/python/parser.py @@ -0,0 +1,232 @@ +from parso.python import tree +from parso import tokenize +from parso.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER, + STRING, tok_name) +from parso.parser import BaseParser +from jedi.common import splitlines + + +class Parser(BaseParser): + """ + This class is used to parse a Python file, it then divides them into a + class structure of different scopes. + + :param grammar: The grammar object of pgen2. Loaded by load_grammar. + """ + + node_map = { + 'expr_stmt': tree.ExprStmt, + 'classdef': tree.Class, + 'funcdef': tree.Function, + 'file_input': tree.Module, + 'import_name': tree.ImportName, + 'import_from': tree.ImportFrom, + 'break_stmt': tree.KeywordStatement, + 'continue_stmt': tree.KeywordStatement, + 'return_stmt': tree.ReturnStmt, + 'raise_stmt': tree.KeywordStatement, + 'yield_expr': tree.YieldExpr, + 'del_stmt': tree.KeywordStatement, + 'pass_stmt': tree.KeywordStatement, + 'global_stmt': tree.GlobalStmt, + 'nonlocal_stmt': tree.KeywordStatement, + 'print_stmt': tree.KeywordStatement, + 'assert_stmt': tree.AssertStmt, + 'if_stmt': tree.IfStmt, + 'with_stmt': tree.WithStmt, + 'for_stmt': tree.ForStmt, + 'while_stmt': tree.WhileStmt, + 'try_stmt': tree.TryStmt, + 'comp_for': tree.CompFor, + 'decorator': tree.Decorator, + 'lambdef': tree.Lambda, + 'old_lambdef': tree.Lambda, + 'lambdef_nocond': tree.Lambda, + } + default_node = tree.PythonNode + + def __init__(self, grammar, error_recovery=True, start_symbol='file_input'): + super(Parser, self).__init__(grammar, start_symbol, error_recovery=error_recovery) + + self.syntax_errors = [] + self._omit_dedent_list = [] + self._indent_counter = 0 + + # TODO do print absolute import detection here. + # try: + # del python_grammar_no_print_statement.keywords["print"] + # except KeyError: + # pass # Doesn't exist in the Python 3 grammar. + + # if self.options["print_function"]: + # python_grammar = pygram.python_grammar_no_print_statement + # else: + + def parse(self, tokens): + if self._error_recovery: + if self._start_symbol != 'file_input': + raise NotImplementedError + + tokens = self._recovery_tokenize(tokens) + + node = super(Parser, self).parse(tokens) + + if self._start_symbol == 'file_input' != node.type: + # If there's only one statement, we get back a non-module. That's + # not what we want, we want a module, so we add it here: + node = self.convert_node( + self._grammar, + self._grammar.symbol2number['file_input'], + [node] + ) + + return node + + def convert_node(self, grammar, type, children): + """ + Convert raw node information to a PythonBaseNode instance. + + This is passed to the parser driver which calls it whenever a reduction of a + grammar rule produces a new complete node, so that the tree is build + strictly bottom-up. + """ + # TODO REMOVE symbol, we don't want type here. + symbol = grammar.number2symbol[type] + try: + return self.node_map[symbol](children) + except KeyError: + if symbol == 'suite': + # We don't want the INDENT/DEDENT in our parser tree. 
Those + # leaves are just cancer. They are virtual leaves and not real + # ones and therefore have pseudo start/end positions and no + # prefixes. Just ignore them. + children = [children[0]] + children[2:-1] + return self.default_node(symbol, children) + + def convert_leaf(self, grammar, type, value, prefix, start_pos): + # print('leaf', repr(value), token.tok_name[type]) + if type == tokenize.NAME: + if value in grammar.keywords: + return tree.Keyword(value, start_pos, prefix) + else: + return tree.Name(value, start_pos, prefix) + elif type == STRING: + return tree.String(value, start_pos, prefix) + elif type == NUMBER: + return tree.Number(value, start_pos, prefix) + elif type == NEWLINE: + return tree.Newline(value, start_pos, prefix) + elif type == ENDMARKER: + return tree.EndMarker(value, start_pos, prefix) + else: + return tree.Operator(value, start_pos, prefix) + + def error_recovery(self, grammar, stack, arcs, typ, value, start_pos, prefix, + add_token_callback): + """ + This parser is written in a dynamic way, meaning that this parser + allows using different grammars (even non-Python). However, error + recovery is purely written for Python. + """ + if not self._error_recovery: + return super(Parser, self).error_recovery( + grammar, stack, arcs, typ, value, start_pos, prefix, + add_token_callback) + + def current_suite(stack): + # For now just discard everything that is not a suite or + # file_input, if we detect an error. + for index, (dfa, state, (type_, nodes)) in reversed(list(enumerate(stack))): + # `suite` can sometimes be only simple_stmt, not stmt. + symbol = grammar.number2symbol[type_] + if symbol == 'file_input': + break + elif symbol == 'suite' and len(nodes) > 1: + # suites without an indent in them get discarded. + break + return index, symbol, nodes + + index, symbol, nodes = current_suite(stack) + + # print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index) + if self._stack_removal(grammar, stack, arcs, index + 1, value, start_pos): + add_token_callback(typ, value, start_pos, prefix) + else: + if typ == INDENT: + # For every deleted INDENT we have to delete a DEDENT as well. + # Otherwise the parser will get into trouble and DEDENT too early. + self._omit_dedent_list.append(self._indent_counter) + else: + error_leaf = tree.PythonErrorLeaf(tok_name[typ].lower(), value, start_pos, prefix) + stack[-1][2][1].append(error_leaf) + + def _stack_removal(self, grammar, stack, arcs, start_index, value, start_pos): + failed_stack = [] + found = False + all_nodes = [] + for dfa, state, (typ, nodes) in stack[start_index:]: + if nodes: + found = True + if found: + symbol = grammar.number2symbol[typ] + failed_stack.append((symbol, nodes)) + all_nodes += nodes + if failed_stack: + stack[start_index - 1][2][1].append(tree.PythonErrorNode(all_nodes)) + + stack[start_index:] = [] + return failed_stack + + def _recovery_tokenize(self, tokens): + for typ, value, start_pos, prefix in tokens: + # print(tokenize.tok_name[typ], repr(value), start_pos, repr(prefix)) + if typ == DEDENT: + # We need to count indents, because if we just omit any DEDENT, + # we might omit them in the wrong place. + o = self._omit_dedent_list + if o and o[-1] == self._indent_counter: + o.pop() + continue + + self._indent_counter -= 1 + elif typ == INDENT: + self._indent_counter += 1 + + yield typ, value, start_pos, prefix + + +def _remove_last_newline(node): + endmarker = node.children[-1] + # The newline is either in the endmarker as a prefix or the previous + # leaf as a newline token. 
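+    # A rough illustration of the two cases:
+    #   "x = 1\n"             -> the trailing newline is the Newline leaf
+    #                            right before the endmarker.
+    #   "x = 1\n# comment\n"  -> the trailing newline sits in the endmarker's
+    #                            prefix, together with the comment.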
+ prefix = endmarker.prefix + leaf = endmarker.get_previous_leaf() + if prefix: + text = prefix + else: + if leaf is None: + raise ValueError("You're trying to remove a newline from an empty module.") + + text = leaf.value + + if not text.endswith('\n'): + raise ValueError("There's no newline at the end, cannot remove it.") + + text = text[:-1] + if prefix: + endmarker.prefix = text + + if leaf is None: + end_pos = (1, 0) + else: + end_pos = leaf.end_pos + + lines = splitlines(text, keepends=True) + if len(lines) == 1: + end_pos = end_pos[0], end_pos[1] + len(lines[0]) + else: + end_pos = end_pos[0] + len(lines) - 1, len(lines[-1]) + endmarker.start_pos = end_pos + else: + leaf.value = text + endmarker.start_pos = leaf.end_pos diff --git a/parso/python/tree.py b/parso/python/tree.py new file mode 100644 index 0000000..d41f639 --- /dev/null +++ b/parso/python/tree.py @@ -0,0 +1,1045 @@ +""" +If you know what an syntax tree is, you'll see that this module is pretty much +that. The classes represent syntax elements like functions and imports. + +This is the "business logic" part of the parser. There's a lot of logic here +that makes it easier for Jedi (and other libraries) to deal with a Python syntax +tree. + +By using `get_code` on a module, you can get back the 1-to-1 representation of +the input given to the parser. This is important if you are using refactoring. + +The easiest way to play with this module is to use :class:`parsing.Parser`. +:attr:`parsing.Parser.module` holds an instance of :class:`Module`: + +>>> from jedi.parser.python import parse +>>> parser = parse('import os') +>>> module = parser.get_root_node() +>>> module + + +Any subclasses of :class:`Scope`, including :class:`Module` has an attribute +:attr:`iter_imports `: + +>>> list(module.iter_imports()) +[] +""" + +from jedi._compatibility import utf8_repr, unicode +from parso.tree import Node, BaseNode, Leaf, ErrorNode, ErrorLeaf, \ + search_ancestor + + +class DocstringMixin(object): + __slots__ = () + + def get_doc_node(self): + """ + Returns the string leaf of a docstring. e.g. ``r'''foo'''``. + """ + if self.type == 'file_input': + node = self.children[0] + elif isinstance(self, ClassOrFunc): + node = self.children[self.children.index(':') + 1] + if node.type == 'suite': # Normally a suite + node = node.children[1] # -> NEWLINE stmt + else: # ExprStmt + simple_stmt = self.parent + c = simple_stmt.parent.children + index = c.index(simple_stmt) + if not index: + return None + node = c[index - 1] + + if node.type == 'simple_stmt': + node = node.children[0] + if node.type == 'string': + return node + return None + + +class PythonMixin(object): + """ + Some Python specific utitilies. 
+ """ + __slots__ = () + + def get_definition(self): + if self.type in ('newline', 'endmarker'): + raise ValueError('Cannot get the indentation of whitespace or indentation.') + scope = self + while scope.parent is not None: + parent = scope.parent + if isinstance(scope, (PythonNode, PythonLeaf)) and parent.type != 'simple_stmt': + if scope.type == 'testlist_comp': + try: + if scope.children[1].type == 'comp_for': + return scope.children[1] + except IndexError: + pass + scope = parent + else: + break + return scope + + def get_name_of_position(self, position): + for c in self.children: + if isinstance(c, Leaf): + if c.type == 'name' and c.start_pos <= position <= c.end_pos: + return c + else: + result = c.get_name_of_position(position) + if result is not None: + return result + return None + + +class PythonLeaf(Leaf, PythonMixin): + __slots__ = () + + +class _LeafWithoutNewlines(PythonLeaf): + """ + Simply here to optimize performance. + """ + __slots__ = () + + @property + def end_pos(self): + return self.line, self.indent + len(self.value) + + +# Python base classes +class PythonBaseNode(BaseNode, PythonMixin): + __slots__ = () + + +class PythonNode(Node, PythonMixin): + __slots__ = () + + +class PythonErrorNode(ErrorNode, PythonMixin): + __slots__ = () + + +class PythonErrorLeaf(ErrorLeaf, PythonMixin): + __slots__ = () + + +class EndMarker(_LeafWithoutNewlines): + __slots__ = () + type = 'endmarker' + + +class Newline(PythonLeaf): + """Contains NEWLINE and ENDMARKER tokens.""" + __slots__ = () + type = 'newline' + + @utf8_repr + def __repr__(self): + return "<%s: %s>" % (type(self).__name__, repr(self.value)) + + +class Name(_LeafWithoutNewlines): + """ + A string. Sometimes it is important to know if the string belongs to a name + or not. + """ + type = 'name' + __slots__ = () + + def __repr__(self): + return "<%s: %s@%s,%s>" % (type(self).__name__, self.value, + self.line, self.indent) + + def is_definition(self): + if self.parent.type in ('power', 'atom_expr'): + # In `self.x = 3` self is not a definition, but x is. + return False + + stmt = self.get_definition() + if stmt.type in ('funcdef', 'classdef', 'param'): + return self == stmt.name + elif stmt.type == 'for_stmt': + return self.start_pos < stmt.children[2].start_pos + elif stmt.type == 'try_stmt': + return self.get_previous_sibling() == 'as' + else: + return stmt.type in ('expr_stmt', 'import_name', 'import_from', + 'comp_for', 'with_stmt') \ + and self in stmt.get_defined_names() + + +class Literal(PythonLeaf): + __slots__ = () + + +class Number(Literal): + type = 'number' + __slots__ = () + + +class String(Literal): + type = 'string' + __slots__ = () + + +class _StringComparisonMixin(object): + def __eq__(self, other): + """ + Make comparisons with strings easy. + Improves the readability of the parser. + """ + if isinstance(other, (str, unicode)): + return self.value == other + + return self is other + + def __ne__(self, other): + """Python 2 compatibility.""" + return not self.__eq__(other) + + def __hash__(self): + return hash(self.value) + + +class Operator(_LeafWithoutNewlines, _StringComparisonMixin): + type = 'operator' + __slots__ = () + + +class Keyword(_LeafWithoutNewlines, _StringComparisonMixin): + type = 'keyword' + __slots__ = () + + +class Scope(PythonBaseNode, DocstringMixin): + """ + Super class for the parser tree, which represents the state of a python + text file. + A Scope is either a function, class or lambda. 
+ """ + __slots__ = () + + def __init__(self, children): + super(Scope, self).__init__(children) + + def iter_funcdefs(self): + """ + Returns a generator of `funcdef` nodes. + """ + return self._search_in_scope('funcdef') + + def iter_classdefs(self): + """ + Returns a generator of `classdef` nodes. + """ + return self._search_in_scope('classdef') + + def iter_imports(self): + """ + Returns a generator of `import_name` and `import_from` nodes. + """ + return self._search_in_scope('import_name', 'import_from') + + def _search_in_scope(self, *names): + def scan(children): + for element in children: + if element.type in names: + yield element + if element.type in ('suite', 'simple_stmt', 'decorated') \ + or isinstance(element, Flow): + for e in scan(element.children): + yield e + + return scan(self.children) + + def get_suite(self): + """ + Returns the part that is executed by the function. + """ + return self.children[-1] + + def __repr__(self): + try: + name = self.name.value + except AttributeError: + name = '' + + return "<%s: %s@%s-%s>" % (type(self).__name__, name, + self.start_pos[0], self.end_pos[0]) + + +class Module(Scope): + """ + The top scope, which is always a module. + Depending on the underlying parser this may be a full module or just a part + of a module. + """ + __slots__ = ('_used_names',) + type = 'file_input' + + def __init__(self, children): + super(Module, self).__init__(children) + self._used_names = None + + def iter_future_import_names(self): + """ + :return list of str: A list of future import names. + """ + # TODO this is a strange scan and not fully correct. I think Python's + # parser does it in a different way and scans for the first + # statement/import with a tokenizer (to check for syntax changes like + # the future print statement). + for imp in self.iter_imports(): + if imp.type == 'import_from' and imp.level == 0: + for path in imp.get_paths(): + names = [name.value for name in path] + if len(names) == 2 and names[0] == '__future__': + yield names[1] + + def has_explicit_absolute_import(self): + """ + Checks if imports in this module are explicitly absolute, i.e. there + is a ``__future__`` import. + :return bool: + """ + for name in self.iter_future_import_names(): + if name == 'absolute_import': + return True + return False + + def get_used_names(self): + """ + Returns all the `Name` leafs that exist in this module. Tihs includes + both definitions and references of names. + """ + if self._used_names is None: + # Don't directly use self._used_names to eliminate a lookup. + dct = {} + + def recurse(node): + try: + children = node.children + except AttributeError: + if node.type == 'name': + arr = dct.setdefault(node.value, []) + arr.append(node) + else: + for child in children: + recurse(child) + + recurse(self) + self._used_names = dct + return self._used_names + + +class Decorator(PythonBaseNode): + type = 'decorator' + __slots__ = () + + +class ClassOrFunc(Scope): + __slots__ = () + + @property + def name(self): + """ + Returns the `Name` leaf that defines the function or class name. + """ + return self.children[1] + + def get_decorators(self): + """ + :return list of Decorator: + """ + decorated = self.parent + if decorated.type == 'decorated': + if decorated.children[0].type == 'decorators': + return decorated.children[0].children + else: + return decorated.children[:1] + else: + return [] + + +class Class(ClassOrFunc): + """ + Used to store the parsed contents of a python class. + + :param name: The Class name. 
+ :type name: str + :param supers: The super classes of a Class. + :type supers: list + :param start_pos: The start position (line, column) of the class. + :type start_pos: tuple(int, int) + """ + type = 'classdef' + __slots__ = () + + def __init__(self, children): + super(Class, self).__init__(children) + + def get_super_arglist(self): + """ + Returns the `arglist` node that defines the super classes. It returns + None if there are no arguments. + """ + if self.children[2] != '(': # Has no parentheses + return None + else: + if self.children[3] == ')': # Empty parentheses + return None + else: + return self.children[3] + + +def _create_params(parent, argslist_list): + """ + `argslist_list` is a list that can contain an argslist as a first item, but + most not. It's basically the items between the parameter brackets (which is + at most one item). + This function modifies the parser structure. It generates `Param` objects + from the normal ast. Those param objects do not exist in a normal ast, but + make the evaluation of the ast tree so much easier. + You could also say that this function replaces the argslist node with a + list of Param objects. + """ + def check_python2_nested_param(node): + """ + Python 2 allows params to look like ``def x(a, (b, c))``, which is + basically a way of unpacking tuples in params. Python 3 has ditched + this behavior. Jedi currently just ignores those constructs. + """ + return node.type == 'tfpdef' and node.children[0] == '(' + + try: + first = argslist_list[0] + except IndexError: + return [] + + if first.type in ('name', 'tfpdef'): + if check_python2_nested_param(first): + return [first] + else: + return [Param([first], parent)] + elif first == '*': + return [first] + else: # argslist is a `typedargslist` or a `varargslist`. + children = first.children + new_children = [] + start = 0 + # Start with offset 1, because the end is higher. + for end, child in enumerate(children + [None], 1): + if child is None or child == ',': + param_children = children[start:end] + if param_children: # Could as well be comma and then end. + if check_python2_nested_param(param_children[0]): + new_children += param_children + elif param_children[0] == '*' and param_children[1] == ',': + new_children += param_children + else: + new_children.append(Param(param_children, parent)) + start = end + return new_children + + +class Function(ClassOrFunc): + """ + Used to store the parsed contents of a python function. + + Children:: + + 0. + 1. + 2. parameter list (including open-paren and close-paren s) + 3. or 5. + 4. or 6. Node() representing function body + 3. -> (if annotation is also present) + 4. annotation (if present) + """ + type = 'funcdef' + + def __init__(self, children): + super(Function, self).__init__(children) + parameters = self.children[2] # After `def foo` + parameters.children[1:-1] = _create_params(parameters, parameters.children[1:-1]) + + def _get_param_nodes(self): + return self.children[2].children + + @property + def params(self): + """ + Returns a list of `Param()`. + """ + return [p for p in self._get_param_nodes() if p.type == 'param'] + + @property + def name(self): + return self.children[1] # First token after `def` + + def iter_yield_exprs(self): + """ + Returns a generator of `yield_expr`. + """ + # TODO This is incorrect, yields are also possible in a statement. + return self._search_in_scope('yield_expr') + + def iter_return_stmts(self): + """ + Returns a generator of `return_stmt`. 
+ """ + return self._search_in_scope('return_stmt') + + def is_generator(self): + """ + :return bool: Checks if a function is a generator or not. + """ + return next(self.iter_yield_exprs(), None) is not None + + @property + def annotation(self): + """ + Returns the test node after `->` or `None` if there is no annotation. + """ + try: + if self.children[3] == "->": + return self.children[4] + assert self.children[3] == ":" + return None + except IndexError: + return None + +class Lambda(Function): + """ + Lambdas are basically trimmed functions, so give it the same interface. + + Children:: + + 0. + *. for each argument x + -2. + -1. Node() representing body + """ + type = 'lambdef' + __slots__ = () + + def __init__(self, children): + # We don't want to call the Function constructor, call its parent. + super(Function, self).__init__(children) + # Everything between `lambda` and the `:` operator is a parameter. + self.children[1:-2] = _create_params(self, self.children[1:-2]) + + @property + def name(self): + """ + Raises an AttributeError. Lambdas don't have a defined name. + """ + raise AttributeError("lambda is not named.") + + def _get_param_nodes(self): + return self.children[1:-2] + + @property + def annotation(self): + """ + Returns `None`, lambdas don't have annotations. + """ + return None + + def __repr__(self): + return "<%s@%s>" % (self.__class__.__name__, self.start_pos) + + +class Flow(PythonBaseNode): + __slots__ = () + + +class IfStmt(Flow): + type = 'if_stmt' + __slots__ = () + + def get_test_nodes(self): + """ + E.g. returns all the `test` nodes that are named as x, below: + + if x: + pass + elif x: + pass + """ + for i, c in enumerate(self.children): + if c in ('elif', 'if'): + yield self.children[i + 1] + + def get_corresponding_test_node(self, node): + """ + Searches for the branch in which the node is and returns the + corresponding test node (see function above). However if the node is in + the test node itself and not in the suite return None. + """ + start_pos = node.start_pos + for check_node in reversed(list(self.get_test_nodes())): + if check_node.start_pos < start_pos: + if start_pos < check_node.end_pos: + return None + # In this case the node is within the check_node itself, + # not in the suite + else: + return check_node + + def is_node_after_else(self, node): + """ + Checks if a node is defined after `else`. + """ + for c in self.children: + if c == 'else': + if node.start_pos > c.start_pos: + return True + else: + return False + + +class WhileStmt(Flow): + type = 'while_stmt' + __slots__ = () + + +class ForStmt(Flow): + type = 'for_stmt' + __slots__ = () + + def get_testlist(self): + """ + Returns the input node ``y`` from: ``for x in y:``. + """ + return self.children[3] + + +class TryStmt(Flow): + type = 'try_stmt' + __slots__ = () + + def get_except_clause_tests(self): + """ + Returns the ``test`` nodes found in ``except_clause`` nodes. + Returns ``[None]`` for except clauses without an exception given. + """ + for node in self.children: + # TODO this is not correct. We're not returning an except clause. + if node.type == 'except_clause': + yield node.children[1] + elif node == 'except': + yield None + + +class WithStmt(Flow): + type = 'with_stmt' + __slots__ = () + + def get_defined_names(self): + """ + Returns the a list of `Name` that the with statement defines. The + defined names are set after `as`. + """ + names = [] + for with_item in self.children[1:-2:2]: + # Check with items for 'as' names. 
+ if with_item.type == 'with_item': + names += _defined_names(with_item.children[2]) + return names + + def get_context_manager_from_name(self, name): + # TODO Replace context_manager with test? + node = name.parent + if node.type != 'with_item': + raise ValueError('The name is not actually part of a with statement.') + return node.children[0] + + +class Import(PythonBaseNode): + __slots__ = () + + def get_path_for_name(self, name): + """ + The path is the list of names that leads to the searched name. + + :return list of Name: + """ + try: + # The name may be an alias. If it is, just map it back to the name. + name = self._aliases()[name] + except KeyError: + pass + + for path in self.get_paths(): + if name in path: + return path[:path.index(name) + 1] + raise ValueError('Name should be defined in the import itself') + + def is_nested(self): + return False # By default, sub classes may overwrite this behavior + + def is_star_import(self): + return self.children[-1] == '*' + + +class ImportFrom(Import): + type = 'import_from' + __slots__ = () + + def get_defined_names(self): + """ + Returns the a list of `Name` that the import defines. The + defined names are set after `import` or in case an alias - `as` - is + present that name is returned. + """ + return [alias or name for name, alias in self._as_name_tuples()] + + def _aliases(self): + """Mapping from alias to its corresponding name.""" + return dict((alias, name) for name, alias in self._as_name_tuples() + if alias is not None) + + def get_from_names(self): + for n in self.children[1:]: + if n not in ('.', '...'): + break + if n.type == 'dotted_name': # from x.y import + return n.children[::2] + elif n == 'import': # from . import + return [] + else: # from x import + return [n] + + @property + def level(self): + """The level parameter of ``__import__``.""" + level = 0 + for n in self.children[1:]: + if n in ('.', '...'): + level += len(n.value) + else: + break + return level + + def _as_name_tuples(self): + last = self.children[-1] + if last == ')': + last = self.children[-2] + elif last == '*': + return # No names defined directly. + + if last.type == 'import_as_names': + as_names = last.children[::2] + else: + as_names = [last] + for as_name in as_names: + if as_name.type == 'name': + yield as_name, None + else: + yield as_name.children[::2] # yields x, y -> ``x as y`` + + def get_paths(self): + """ + The import paths defined in an import statement. Typically an array + like this: ``[, ]``. + + :return list of list of Name: + """ + dotted = self.get_from_names() + + if self.children[-1] == '*': + return [dotted] + return [dotted + [name] for name, alias in self._as_name_tuples()] + + +class ImportName(Import): + """For ``import_name`` nodes. Covers normal imports without ``from``.""" + type = 'import_name' + __slots__ = () + + def get_defined_names(self): + """ + Returns the a list of `Name` that the import defines. The defined names + is always the first name after `import` or in case an alias - `as` - is + present that name is returned. + """ + return [alias or path[0] for path, alias in self._dotted_as_names()] + + @property + def level(self): + """The level parameter of ``__import__``.""" + return 0 # Obviously 0 for imports without from. 
+ + def get_paths(self): + return [path for path, alias in self._dotted_as_names()] + + def _dotted_as_names(self): + """Generator of (list(path), alias) where alias may be None.""" + dotted_as_names = self.children[1] + if dotted_as_names.type == 'dotted_as_names': + as_names = dotted_as_names.children[::2] + else: + as_names = [dotted_as_names] + + for as_name in as_names: + if as_name.type == 'dotted_as_name': + alias = as_name.children[2] + as_name = as_name.children[0] + else: + alias = None + if as_name.type == 'name': + yield [as_name], alias + else: + # dotted_names + yield as_name.children[::2], alias + + def is_nested(self): + """ + This checks for the special case of nested imports, without aliases and + from statement:: + + import foo.bar + """ + return bool([1 for path, alias in self._dotted_as_names() + if alias is None and len(path) > 1]) + + def _aliases(self): + """ + :return list of Name: Returns all the alias + """ + return dict((alias, path[-1]) for path, alias in self._dotted_as_names() + if alias is not None) + + +class KeywordStatement(PythonBaseNode): + """ + For the following statements: `assert`, `del`, `global`, `nonlocal`, + `raise`, `return`, `yield`, `return`, `yield`. + + `pass`, `continue` and `break` are not in there, because they are just + simple keywords and the parser reduces it to a keyword. + """ + __slots__ = () + + @property + def type(self): + """ + Keyword statements start with the keyword and end with `_stmt`. You can + crosscheck this with the Python grammar. + """ + return '%s_stmt' % self.keyword + + @property + def keyword(self): + return self.children[0].value + + +class AssertStmt(KeywordStatement): + __slots__ = () + + @property + def assertion(self): + return self.children[1] + + +class GlobalStmt(KeywordStatement): + __slots__ = () + + def get_global_names(self): + return self.children[1::2] + + +class ReturnStmt(KeywordStatement): + __slots__ = () + + +class YieldExpr(PythonBaseNode): + type = 'yield_expr' + __slots__ = () + + +def _defined_names(current): + """ + A helper function to find the defined names in statements, for loops and + list comprehensions. + """ + names = [] + if current.type in ('testlist_star_expr', 'testlist_comp', 'exprlist'): + for child in current.children[::2]: + names += _defined_names(child) + elif current.type in ('atom', 'star_expr'): + names += _defined_names(current.children[1]) + elif current.type in ('power', 'atom_expr'): + if current.children[-2] != '**': # Just if there's no operation + trailer = current.children[-1] + if trailer.children[0] == '.': + names.append(trailer.children[1]) + else: + names.append(current) + return names + + +class ExprStmt(PythonBaseNode, DocstringMixin): + type = 'expr_stmt' + __slots__ = () + + def get_defined_names(self): + """ + Returns a list of `Name` defined before the `=` sign. + """ + names = [] + if self.children[1].type == 'annassign': + names = _defined_names(self.children[0]) + return [ + name + for i in range(0, len(self.children) - 2, 2) + if '=' in self.children[i + 1].value + for name in _defined_names(self.children[i]) + ] + names + + def get_rhs(self): + """Returns the right-hand-side of the equals.""" + return self.children[-1] + + def yield_operators(self): + """ + Returns a generator of `+=`, `=`, etc. or None if there is no operation. + """ + first = self.children[1] + if first.type == 'annassign': + if len(first.children) <= 2: + return # No operator is available, it's just PEP 484. 
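+            # e.g. for `x: int = 3` the annassign children are
+            # [':', <int>, '=', <3>], so the '=' operator is children[2].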
+ + first = first.children[2] + yield first + + for operator in self.children[3::2]: + yield operator + + +class Param(PythonBaseNode): + """ + It's a helper class that makes business logic with params much easier. The + Python grammar defines no ``param`` node. It defines it in a different way + that is not really suited to working with parameters. + """ + type = 'param' + + def __init__(self, children, parent): + super(Param, self).__init__(children) + self.parent = parent + for child in children: + child.parent = self + + @property + def star_count(self): + """ + Is `0` in case of `foo`, `1` in case of `*foo` or `2` in case of + `**foo`. + """ + first = self.children[0] + if first in ('*', '**'): + return len(first.value) + return 0 + + @property + def default(self): + """ + The default is the test node that appears after the `=`. Is `None` in + case no default is present. + """ + try: + return self.children[int(self.children[0] in ('*', '**')) + 2] + except IndexError: + return None + + @property + def annotation(self): + """ + The default is the test node that appears after `->`. Is `None` in case + no annotation is present. + """ + tfpdef = self._tfpdef() + if tfpdef.type == 'tfpdef': + assert tfpdef.children[1] == ":" + assert len(tfpdef.children) == 3 + annotation = tfpdef.children[2] + return annotation + else: + return None + + def _tfpdef(self): + """ + tfpdef: see grammar.txt. + """ + offset = int(self.children[0] in ('*', '**')) + return self.children[offset] + + @property + def name(self): + """ + The `Name` leaf of the param. + """ + if self._tfpdef().type == 'tfpdef': + return self._tfpdef().children[0] + else: + return self._tfpdef() + + @property + def position_index(self): + """ + Property for the positional index of a paramter. + """ + index = self.parent.children.index(self) + try: + keyword_only_index = self.parent.children.index('*') + if index > keyword_only_index: + # Skip the ` *, ` + index -= 2 + except ValueError: + pass + return index - 1 + + def get_parent_function(self): + """ + Returns the function/lambda of a parameter. + """ + return search_ancestor(self, 'funcdef', 'lambdef') + + def get_code(self, normalized=False, include_prefix=True, include_comma=True): + """ + Like all the other get_code functions, but includes the param + `include_comma`. + + :param include_comma bool: If enabled includes the comma in the string output. + """ + if include_comma: + return super(Param, self).get_code(normalized, include_prefix) + + children = self.children + if children[-1] == ',': + children = children[:-1] + return self._get_code_for_children( + children, + normalized=False, + include_prefix=include_prefix + ) + + def __repr__(self): + default = '' if self.default is None else '=%s' % self.default.get_code() + return '<%s: %s>' % (type(self).__name__, str(self._tfpdef()) + default) + + +class CompFor(PythonBaseNode): + type = 'comp_for' + __slots__ = () + + def get_defined_names(self): + """ + Returns the a list of `Name` that the comprehension defines. 
+ """ + return _defined_names(self.children[1]) diff --git a/parso/token.py b/parso/token.py new file mode 100644 index 0000000..0cb846d --- /dev/null +++ b/parso/token.py @@ -0,0 +1,90 @@ +from __future__ import absolute_import + +from jedi._compatibility import is_py3, is_py35 +from token import * + + +COMMENT = N_TOKENS +tok_name[COMMENT] = 'COMMENT' +N_TOKENS += 1 + +NL = N_TOKENS +tok_name[NL] = 'NL' +N_TOKENS += 1 + +if is_py3: + BACKQUOTE = N_TOKENS + tok_name[BACKQUOTE] = 'BACKQUOTE' + N_TOKENS += 1 +else: + RARROW = N_TOKENS + tok_name[RARROW] = 'RARROW' + N_TOKENS += 1 + ELLIPSIS = N_TOKENS + tok_name[ELLIPSIS] = 'ELLIPSIS' + N_TOKENS += 1 + +if not is_py35: + ATEQUAL = N_TOKENS + tok_name[ATEQUAL] = 'ATEQUAL' + N_TOKENS += 1 + + + +# Map from operator to number (since tokenize doesn't do this) + +opmap_raw = """\ +( LPAR +) RPAR +[ LSQB +] RSQB +: COLON +, COMMA +; SEMI ++ PLUS +- MINUS +* STAR +/ SLASH +| VBAR +& AMPER +< LESS +> GREATER += EQUAL +. DOT +% PERCENT +` BACKQUOTE +{ LBRACE +} RBRACE +@ AT +== EQEQUAL +!= NOTEQUAL +<> NOTEQUAL +<= LESSEQUAL +>= GREATEREQUAL +~ TILDE +^ CIRCUMFLEX +<< LEFTSHIFT +>> RIGHTSHIFT +** DOUBLESTAR ++= PLUSEQUAL +-= MINEQUAL +*= STAREQUAL +/= SLASHEQUAL +%= PERCENTEQUAL +&= AMPEREQUAL +|= VBAREQUAL +@= ATEQUAL +^= CIRCUMFLEXEQUAL +<<= LEFTSHIFTEQUAL +>>= RIGHTSHIFTEQUAL +**= DOUBLESTAREQUAL +// DOUBLESLASH +//= DOUBLESLASHEQUAL +-> RARROW +... ELLIPSIS +""" + +opmap = {} +for line in opmap_raw.splitlines(): + op, name = line.split() + opmap[op] = globals()[name] diff --git a/parso/tokenize.py b/parso/tokenize.py new file mode 100644 index 0000000..ee00717 --- /dev/null +++ b/parso/tokenize.py @@ -0,0 +1,369 @@ +# -*- coding: utf-8 -*- +""" +This tokenizer has been copied from the ``tokenize.py`` standard library +tokenizer. The reason was simple: The standard library tokenizer fails +if the indentation is not right. The fast parser of jedi however requires +"wrong" indentation. + +Basically this is a stripped down version of the standard library module, so +you can read the documentation there. Additionally we included some speed and +memory optimizations here. +""" +from __future__ import absolute_import + +import string +import re +from collections import namedtuple +import itertools as _itertools + +from parso.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap, + NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT) +from jedi._compatibility import is_py3, py_version, u +from jedi.common import splitlines + + +cookie_re = re.compile("coding[:=]\s*([-\w.]+)") + + +if is_py3: + # Python 3 has str.isidentifier() to check if a char is a valid identifier + is_identifier = str.isidentifier +else: + namechars = string.ascii_letters + '_' + is_identifier = lambda s: s in namechars + + +COMMENT = N_TOKENS +tok_name[COMMENT] = 'COMMENT' + + +def group(*choices, **kwargs): + capture = kwargs.pop('capture', False) # Python 2, arrghhhhh :( + assert not kwargs + + start = '(' + if not capture: + start += '?:' + return start + '|'.join(choices) + ')' + +def any(*choices): + return group(*choices) + '*' + +def maybe(*choices): + return group(*choices) + '?' + +# Note: we use unicode matching for names ("\w") but ascii matching for +# number literals. 
+Whitespace = r'[ \f\t]*' +Comment = r'#[^\r\n]*' +Name = r'\w+' + +if py_version >= 36: + Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+' + Binnumber = r'0[bB](?:_?[01])+' + Octnumber = r'0[oO](?:_?[0-7])+' + Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)' + Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) + Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*' + Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?', + r'\.[0-9](?:_?[0-9])*') + maybe(Exponent) + Expfloat = r'[0-9](?:_?[0-9])*' + Exponent + Floatnumber = group(Pointfloat, Expfloat) + Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]') +else: + Hexnumber = r'0[xX][0-9a-fA-F]+' + Binnumber = r'0[bB][01]+' + if is_py3: + Octnumber = r'0[oO][0-7]+' + else: + Octnumber = '0[0-7]+' + Decnumber = r'(?:0+|[1-9][0-9]*)' + Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) + Exponent = r'[eE][-+]?[0-9]+' + Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent) + Expfloat = r'[0-9]+' + Exponent + Floatnumber = group(Pointfloat, Expfloat) + Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]') +Number = group(Imagnumber, Floatnumber, Intnumber) + +# Return the empty string, plus all of the valid string prefixes. +def _all_string_prefixes(): + # The valid string prefixes. Only contain the lower case versions, + # and don't contain any permuations (include 'fr', but not + # 'rf'). The various permutations will be generated. + _valid_string_prefixes = ['b', 'r', 'u', 'br'] + if py_version >= 36: + _valid_string_prefixes += ['f', 'fr'] + if py_version <= 27: + # TODO this is actually not 100% valid. ur is valid in Python 2.7, + # while ru is not. + _valid_string_prefixes.append('ur') + + # if we add binary f-strings, add: ['fb', 'fbr'] + result = set(['']) + for prefix in _valid_string_prefixes: + for t in _itertools.permutations(prefix): + # create a list with upper and lower versions of each + # character + for u in _itertools.product(*[(c, c.upper()) for c in t]): + result.add(''.join(u)) + return result + +def _compile(expr): + return re.compile(expr, re.UNICODE) + +# Note that since _all_string_prefixes includes the empty string, +# StringPrefix can be the empty string (making it optional). +StringPrefix = group(*_all_string_prefixes()) + +# Tail end of ' string. +Single = r"[^'\\]*(?:\\.[^'\\]*)*'" +# Tail end of " string. +Double = r'[^"\\]*(?:\\.[^"\\]*)*"' +# Tail end of ''' string. +Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" +# Tail end of """ string. +Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' +Triple = group(StringPrefix + "'''", StringPrefix + '"""') + +# Because of leftmost-then-longest match semantics, be sure to put the +# longest operators first (e.g., if = came before ==, == would get +# recognized as two instances of =). +Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=", + r"//=?", r"->", + r"[+\-*/%&@|^=<>]=?", + r"~") + +Bracket = '[][(){}]' +Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]') +Funny = group(Operator, Bracket, Special) + +PlainToken = group(Number, Funny, Name, capture=True) + +# First (or only) line of ' or " string. 
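+# (The second alternative in each branch below, a trailing backslash-newline,
+# is what lets a single-quoted string continue on the next line; the
+# tokenizer then switches into its contstr mode.)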
+ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + + group("'", r'\\\r?\n'), + StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + + group('"', r'\\\r?\n')) +PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple) +PseudoToken = group(Whitespace, capture=True) + \ + group(PseudoExtras, Number, Funny, ContStr, Name, capture=True) + +# For a given string prefix plus quotes, endpats maps it to a regex +# to match the remainder of that string. _prefix can be empty, for +# a normal single or triple quoted string (with no prefix). +endpats = {} +for _prefix in _all_string_prefixes(): + endpats[_prefix + "'"] = _compile(Single) + endpats[_prefix + '"'] = _compile(Double) + endpats[_prefix + "'''"] = _compile(Single3) + endpats[_prefix + '"""'] = _compile(Double3) + +# A set of all of the single and triple quoted string prefixes, +# including the opening quotes. +single_quoted = set() +triple_quoted = set() +for t in _all_string_prefixes(): + for p in (t + '"', t + "'"): + single_quoted.add(p) + for p in (t + '"""', t + "'''"): + triple_quoted.add(p) + + +# TODO add with? +ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except', + 'finally', 'while', 'return') +pseudo_token_compiled = _compile(PseudoToken) + + +class TokenInfo(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])): + def __repr__(self): + return ('TokenInfo(type=%s, string=%r, start=%r, prefix=%r)' % + self._replace(type=self.get_type_name())) + + def get_type_name(self, exact=True): + if exact: + typ = self.exact_type + else: + typ = self.type + return tok_name[typ] + + @property + def exact_type(self): + if self.type == OP and self.string in opmap: + return opmap[self.string] + else: + return self.type + + @property + def end_pos(self): + lines = splitlines(self.string) + if len(lines) > 1: + return self.start_pos[0] + len(lines) - 1, 0 + else: + return self.start_pos[0], self.start_pos[1] + len(self.string) + + +def source_tokens(source, use_exact_op_types=False): + """Generate tokens from a the source code (string).""" + lines = splitlines(source, keepends=True) + return generate_tokens(lines, use_exact_op_types) + + +def generate_tokens(lines, use_exact_op_types=False): + """ + A heavily modified Python standard library tokenizer. + + Additionally to the default information, yields also the prefix of each + token. This idea comes from lib2to3. The prefix contains all information + that is irrelevant for the parser like newlines in parentheses or comments. + """ + paren_level = 0 # count parentheses + indents = [0] + max = 0 + numchars = '0123456789' + contstr = '' + contline = None + # We start with a newline. This makes indent at the first position + # possible. It's not valid Python, but still better than an INDENT in the + # second line (and not in the first). This makes quite a few things in + # Jedi's fast parser possible. 
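+    # Priming new_line here means the very first token already runs through
+    # the INDENT/DEDENT bookkeeping below, just like tokens on later lines.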
+ new_line = True + prefix = '' # Should never be required, but here for safety + additional_prefix = '' + for lnum, line in enumerate(lines, 1): # loop over lines in stream + pos, max = 0, len(line) + + if contstr: # continued string + endmatch = endprog.match(line) + if endmatch: + pos = endmatch.end(0) + yield TokenInfo(STRING, contstr + line[:pos], contstr_start, prefix) + contstr = '' + contline = None + else: + contstr = contstr + line + contline = contline + line + continue + + while pos < max: + pseudomatch = pseudo_token_compiled.match(line, pos) + if not pseudomatch: # scan for tokens + txt = line[pos:] + if txt.endswith('\n'): + new_line = True + yield TokenInfo(ERRORTOKEN, txt, (lnum, pos), prefix) + break + + prefix = additional_prefix + pseudomatch.group(1) + additional_prefix = '' + start, pos = pseudomatch.span(2) + spos = (lnum, start) + token = pseudomatch.group(2) + initial = token[0] + + if new_line and initial not in '\r\n#': + new_line = False + if paren_level == 0: + i = 0 + while line[i] == '\f': + i += 1 + start -= 1 + if start > indents[-1]: + yield TokenInfo(INDENT, '', spos, '') + indents.append(start) + while start < indents[-1]: + yield TokenInfo(DEDENT, '', spos, '') + indents.pop() + + if (initial in numchars or # ordinary number + (initial == '.' and token != '.' and token != '...')): + yield TokenInfo(NUMBER, token, spos, prefix) + elif initial in '\r\n': + if not new_line and paren_level == 0: + yield TokenInfo(NEWLINE, token, spos, prefix) + else: + additional_prefix = prefix + token + new_line = True + elif initial == '#': # Comments + assert not token.endswith("\n") + additional_prefix = prefix + token + elif token in triple_quoted: + endprog = endpats[token] + endmatch = endprog.match(line, pos) + if endmatch: # all on one line + pos = endmatch.end(0) + token = line[start:pos] + yield TokenInfo(STRING, token, spos, prefix) + else: + contstr_start = (lnum, start) # multiple lines + contstr = line[start:] + contline = line + break + elif initial in single_quoted or \ + token[:2] in single_quoted or \ + token[:3] in single_quoted: + if token[-1] == '\n': # continued string + contstr_start = lnum, start + endprog = (endpats.get(initial) or endpats.get(token[1]) + or endpats.get(token[2])) + contstr = line[start:] + contline = line + break + else: # ordinary string + yield TokenInfo(STRING, token, spos, prefix) + elif is_identifier(initial): # ordinary name + if token in ALWAYS_BREAK_TOKENS: + paren_level = 0 + while True: + indent = indents.pop() + if indent > start: + yield TokenInfo(DEDENT, '', spos, '') + else: + indents.append(indent) + break + yield TokenInfo(NAME, token, spos, prefix) + elif initial == '\\' and line[start:] in ('\\\n', '\\\r\n'): # continued stmt + additional_prefix += prefix + line[start:] + break + else: + if token in '([{': + paren_level += 1 + elif token in ')]}': + paren_level -= 1 + + try: + # This check is needed in any case to check if it's a valid + # operator or just some random unicode character. + exact_type = opmap[token] + except KeyError: + exact_type = typ = ERRORTOKEN + if use_exact_op_types: + typ = exact_type + else: + typ = OP + yield TokenInfo(typ, token, spos, prefix) + + if contstr: + yield TokenInfo(ERRORTOKEN, contstr, contstr_start, prefix) + if contstr.endswith('\n'): + new_line = True + + end_pos = lnum, max + # As the last position we just take the maximally possible position. We + # remove -1 for the last new line. 
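+    # Close any indentation levels that are still open at the end of the file
+    # with matching DEDENTs before the final ENDMARKER is emitted.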
+ for indent in indents[1:]: + yield TokenInfo(DEDENT, '', end_pos, '') + yield TokenInfo(ENDMARKER, '', end_pos, additional_prefix) + + +if __name__ == "__main__": + import sys + if len(sys.argv) >= 2: + path = sys.argv[1] + with open(path) as f: + code = u(f.read()) + else: + code = u(sys.stdin.read()) + for token in source_tokens(code, use_exact_op_types=True): + print(token) diff --git a/parso/tree.py b/parso/tree.py new file mode 100644 index 0000000..bf1bdae --- /dev/null +++ b/parso/tree.py @@ -0,0 +1,328 @@ +from abc import abstractmethod, abstractproperty +from parso._compatibility import utf8_repr, encoding, is_py3 + + +def search_ancestor(node, *node_types): + """ + Recursively looks at the parents of a node and checks if the type names + match. + + :param node: The node that is looked at. + :param node_types: A tuple or a string of type names that are + searched for. + """ + while True: + node = node.parent + if node is None or node.type in node_types: + return node + + +class NodeOrLeaf(object): + """ + The base class for nodes and leaves. + """ + __slots__ = () + + def get_root_node(self): + """ + Returns the root node of a parser tree. The returned node doesn't have + a parent node like all the other nodes/leaves. + """ + scope = self + while scope.parent is not None: + scope = scope.parent + return scope + + def get_next_sibling(self): + """ + The node immediately following the invocant in their parent's children + list. If the invocant does not have a next sibling, it is None + """ + # Can't use index(); we need to test by identity + for i, child in enumerate(self.parent.children): + if child is self: + try: + return self.parent.children[i + 1] + except IndexError: + return None + + def get_previous_sibling(self): + """ + The node/leaf immediately preceding the invocant in their parent's + children list. If the invocant does not have a previous sibling, it is + None. + """ + # Can't use index(); we need to test by identity + for i, child in enumerate(self.parent.children): + if child is self: + if i == 0: + return None + return self.parent.children[i - 1] + + def get_previous_leaf(self): + """ + Returns the previous leaf in the parser tree. + Raises an IndexError if it's the first element in the parser tree. + """ + node = self + while True: + c = node.parent.children + i = c.index(node) + if i == 0: + node = node.parent + if node.parent is None: + return None + else: + node = c[i - 1] + break + + while True: + try: + node = node.children[-1] + except AttributeError: # A Leaf doesn't have children. + return node + + def get_next_leaf(self): + """ + Returns the next leaf in the parser tree. + Returns `None` if it's the last element in the parser tree. + """ + node = self + while True: + c = node.parent.children + i = c.index(node) + if i == len(c) - 1: + node = node.parent + if node.parent is None: + return None + else: + node = c[i + 1] + break + + while True: + try: + node = node.children[0] + except AttributeError: # A Leaf doesn't have children. + return node + + @abstractproperty + def start_pos(self): + """ + Returns the starting position of the prefix as a tuple, e.g. `(3, 4)`. + + :return tuple of int: (line, column) + """ + + @abstractproperty + def end_pos(self): + """ + Returns the end position of the prefix as a tuple, e.g. `(3, 4)`. + + :return tuple of int: (line, column) + """ + + @abstractmethod + def get_start_pos_of_prefix(self): + """ + Returns the start_pos of the prefix. This means basically it returns + the end_pos of the last prefix. 
The `get_start_pos_of_prefix()` of the + prefix `+` in `2 + 1` would be `(1, 1)`, while the start_pos is + `(1, 2)`. + + :return tuple of int: (line, column) + """ + + @abstractmethod + def get_first_leaf(self): + """ + Returns the first leaf of a node or itself it's a leaf. + """ + + @abstractmethod + def get_last_leaf(self): + """ + Returns the last leaf of a node or itself it's a leaf. + """ + + @abstractmethod + def get_code(self, normalized=False, include_prefix=True): + """ + Returns the code that was the input of the parser. + + If a normalizer is given, the returned code will be normalized and will + not be equal to the input. + + :param include_prefix: Removes the prefix (whitespace and comments) of e.g. a statement. + :param normalized: Deprecated. Please don't use. Will be replaced with something more powerful. + """ + + +class Leaf(NodeOrLeaf): + __slots__ = ('value', 'parent', 'line', 'indent', 'prefix') + + def __init__(self, value, start_pos, prefix=''): + self.value = value + self.start_pos = start_pos + self.prefix = prefix + self.parent = None + + @property + def start_pos(self): + return self.line, self.indent + + @start_pos.setter + def start_pos(self, value): + self.line = value[0] + self.indent = value[1] + + def get_start_pos_of_prefix(self): + previous_leaf = self.get_previous_leaf() + if previous_leaf is None: + return self.line - self.prefix.count('\n'), 0 # It's the first leaf. + return previous_leaf.end_pos + + def get_first_leaf(self): + return self + + def get_last_leaf(self): + return self + + def get_code(self, normalized=False, include_prefix=True): + if normalized: + return self.value + if include_prefix: + return self.prefix + self.value + else: + return self.value + + @property + def end_pos(self): + lines = self.value.split('\n') + end_pos_line = self.line + len(lines) - 1 + # Check for multiline token + if self.line == end_pos_line: + end_pos_indent = self.indent + len(lines[-1]) + else: + end_pos_indent = len(lines[-1]) + return end_pos_line, end_pos_indent + + @utf8_repr + def __repr__(self): + return "<%s: %s start=%s>" % (type(self).__name__, self.value, self.start_pos) + + +class BaseNode(NodeOrLeaf): + """ + The super class for all nodes. + + If you create custom nodes, you will probably want to inherit from this + ``BaseNode``. + """ + __slots__ = ('children', 'parent') + type = None + + def __init__(self, children): + for c in children: + c.parent = self + self.children = children + self.parent = None + + @property + def start_pos(self): + return self.children[0].start_pos + + def get_start_pos_of_prefix(self): + return self.children[0].get_start_pos_of_prefix() + + @property + def end_pos(self): + return self.children[-1].end_pos + + def _get_code_for_children(self, children, normalized, include_prefix): + # TODO implement normalized (depending on context). + if include_prefix: + return "".join(c.get_code(normalized) for c in children) + else: + first = children[0].get_code(include_prefix=False) + return first + "".join(c.get_code(normalized) for c in children[1:]) + + def get_code(self, normalized=False, include_prefix=True): + return self._get_code_for_children(self.children, normalized, include_prefix) + + def get_leaf_for_position(self, position, include_prefixes=False): + def binary_search(lower, upper): + if lower == upper: + element = self.children[lower] + if not include_prefixes and position < element.start_pos: + # We're on a prefix. 
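+                    # (i.e. the position points into the whitespace/comments
+                    # before this leaf, so there is no leaf to return here)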
+                    return None
+                # In case we have prefixes, a leaf always matches
+                try:
+                    return element.get_leaf_for_position(position, include_prefixes)
+                except AttributeError:
+                    return element
+
+
+            index = int((lower + upper) / 2)
+            element = self.children[index]
+            if position <= element.end_pos:
+                return binary_search(lower, index)
+            else:
+                return binary_search(index + 1, upper)
+
+        if not ((1, 0) <= position <= self.children[-1].end_pos):
+            raise ValueError('Please provide a position that exists within this node.')
+        return binary_search(0, len(self.children) - 1)
+
+    def get_first_leaf(self):
+        return self.children[0].get_first_leaf()
+
+    def get_last_leaf(self):
+        return self.children[-1].get_last_leaf()
+
+    @utf8_repr
+    def __repr__(self):
+        code = self.get_code().replace('\n', ' ').strip()
+        if not is_py3:
+            code = code.encode(encoding, 'replace')
+        return "<%s: %s@%s,%s>" % \
+            (type(self).__name__, code, self.start_pos[0], self.start_pos[1])
+
+
+class Node(BaseNode):
+    """Concrete implementation for interior nodes."""
+    __slots__ = ('type',)
+
+    def __init__(self, type, children):
+        super(Node, self).__init__(children)
+        self.type = type
+
+    def __repr__(self):
+        return "%s(%s, %r)" % (self.__class__.__name__, self.type, self.children)
+
+
+class ErrorNode(BaseNode):
+    """
+    A node that contains valid nodes/leaves that are followed by a token that
+    was invalid. This basically means that the leaf after this node is where
+    Python would mark a syntax error.
+    """
+    __slots__ = ()
+    type = 'error_node'
+
+
+class ErrorLeaf(Leaf):
+    """
+    A leaf that is either completely invalid in a language (like `$` in Python)
+    or is invalid at that position. Like the star in `1 +* 1`.
+    """
+    __slots__ = ('original_type',)
+    type = 'error_leaf'
+
+    def __init__(self, original_type, value, start_pos, prefix=''):
+        super(ErrorLeaf, self).__init__(value, start_pos, prefix)
+        self.original_type = original_type
+
+    def __repr__(self):
+        return "<%s: %s:%s, %s>" % \
+            (type(self).__name__, self.original_type, repr(self.value), self.start_pos)
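+
+
+# A rough usage sketch (trees are normally produced by the parser, not built
+# by hand; the node type and positions below are made up for illustration):
+#
+#     node = Node('arith_expr', [Leaf('1', (1, 0)),
+#                                Leaf('+', (1, 2), ' '),
+#                                Leaf('2', (1, 4), ' ')])
+#     node.get_code()       # -> '1 + 2'
+#     node.end_pos          # -> (1, 5)
+#     node.get_last_leaf()  # -> the '2' leaf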