1
0
forked from VimPlug/jedi
Files
jedi-fork/jedi/parser/pgen2/parse.py
2014-11-28 21:33:40 +01:00

223 lines
8.7 KiB
Python

# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""
Parser engine for the grammar tables generated by pgen.
The grammar table must be loaded first.
See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.
"""
# Local imports
from . import token
class ParseError(Exception):
"""Exception to signal the parser is stuck."""
def __init__(self, msg, type, value, start_pos):
Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
(msg, type, value, start_pos))
self.msg = msg
self.type = type
self.value = value
self.start_pos = start_pos
class Parser(object):
"""Parser engine.
The proper usage sequence is:
p = Parser(grammar, [converter]) # create instance
p.setup([start]) # prepare for parsing
<for each input token>:
if p.addtoken(...): # parse a token; may raise ParseError
break
root = p.rootnode # root of abstract syntax tree
A Parser instance may be reused by calling setup() repeatedly.
A Parser instance contains state pertaining to the current token
sequence, and should not be used concurrently by different threads
to parse separate token sequences.
See driver.py for how to get input tokens by tokenizing a file or
string.
Parsing is complete when addtoken() returns True; the root of the
abstract syntax tree can then be retrieved from the rootnode
instance variable. When a syntax error occurs, addtoken() raises
the ParseError exception. There is no error recovery; the parser
cannot be used after a syntax error was reported (but it can be
reinitialized by calling setup()).
"""
def __init__(self, grammar, convert_node, convert_leaf, error_recovery):
"""Constructor.
The grammar argument is a grammar.Grammar instance; see the
grammar module for more information.
The parser is not ready yet for parsing; you must call the
setup() method to get it started.
The optional convert argument is a function mapping concrete
syntax tree nodes to abstract syntax tree nodes. If not
given, no conversion is done and the syntax tree produced is
the concrete syntax tree. If given, it must be a function of
two arguments, the first being the grammar (a grammar.Grammar
instance), and the second being the concrete syntax tree node
to be converted. The syntax tree is converted from the bottom
up.
A concrete syntax tree node is a (type, nodes) tuple, where
type is the node type (a token or symbol number) and nodes
is a list of children for symbols, and None for tokens.
An abstract syntax tree node may be anything; this is entirely
up to the converter function.
"""
self.grammar = grammar
self.convert_node = convert_node
self.convert_leaf = convert_leaf
# Prepare for parsing.
start = self.grammar.start
# Each stack entry is a tuple: (dfa, state, node).
# A node is a tuple: (type, children),
# where children is a list of nodes or None
newnode = (start, [])
stackentry = (self.grammar.dfas[start], 0, newnode)
self.stack = [stackentry]
self.rootnode = None
self.error_recovery = error_recovery
def tokenize(self, tokenizer):
"""
This is not a real tokenizer, but it adds indents. You could hand the
parse function a normal tokenizer (e.g. the lib2to3 one). But if we use
the parser stack we are able to do error recovery from wrong indents.
"""
for type, value, prefix, start_pos in tokenizer:
#print(token.tok_name[type], value)
yield type, value, prefix, start_pos
def parse(self, tokenizer):
for type, value, prefix, start_pos in tokenizer:
if self.addtoken(type, value, prefix, start_pos):
break
else:
# We never broke out -- EOF is too soon -- Unfinished statement.
self.error_recovery(self.grammar, self.stack, type, value,
start_pos, prefix, self.addtoken)
# Add the ENDMARKER again.
if not self.addtoken(type, value, prefix, start_pos):
raise ParseError("incomplete input", type, value, start_pos)
return self.rootnode
def addtoken(self, type, value, prefix, start_pos):
"""Add a token; return True iff this is the end of the program."""
# Map from token to label
try:
ilabel = self.classify(type, value, start_pos)
except ParseError:
# Currently we ignore tokens like `?`.
print('invalid token', token.tok_name[type], value)
return
# Loop until the token is shifted; may raise exceptions
while True:
dfa, state, node = self.stack[-1]
states, first = dfa
arcs = states[state]
# Look for a state with this label
for i, newstate in arcs:
t, v = self.grammar.labels[i]
if ilabel == i:
# Look it up in the list of labels
assert t < 256
# Shift a token; we're done with it
self.shift(type, value, newstate, prefix, start_pos)
# Pop while we are in an accept-only state
state = newstate
while states[state] == [(0, state)]:
self.pop()
if not self.stack:
# Done parsing!
return True
dfa, state, node = self.stack[-1]
states, first = dfa
# Done with this token
return False
elif t >= 256:
# See if it's a symbol and if we're in its first set
itsdfa = self.grammar.dfas[t]
itsstates, itsfirst = itsdfa
if ilabel in itsfirst:
# Push a symbol
self.push(t, itsdfa, newstate)
break # To continue the outer while loop
else:
if (0, state) in arcs:
# An accepting state, pop it and try something else
self.pop()
if not self.stack:
# Done parsing, but another token is input
raise ParseError("too much input", type, value, start_pos)
else:
self.error_recovery(self.grammar, self.stack, type,
value, start_pos, prefix, self.addtoken)
break
def classify(self, type, value, start_pos):
"""Turn a token into a label. (Internal)"""
if type == token.NAME:
# Check for reserved words
ilabel = self.grammar.keywords.get(value)
if ilabel is not None:
return ilabel
ilabel = self.grammar.tokens.get(type)
if ilabel is None:
raise ParseError("bad token", type, value, start_pos)
return ilabel
def shift(self, type, value, newstate, prefix, start_pos):
"""Shift a token. (Internal)"""
dfa, state, node = self.stack[-1]
newnode = self.convert_leaf(self.grammar, type, value, prefix, start_pos)
node[-1].append(newnode)
self.stack[-1] = (dfa, newstate, node)
def push(self, type, newdfa, newstate):
"""Push a nonterminal. (Internal)"""
dfa, state, node = self.stack[-1]
newnode = (type, [])
self.stack[-1] = (dfa, newstate, node)
self.stack.append((newdfa, 0, newnode))
def pop(self):
"""Pop a nonterminal. (Internal)"""
popdfa, popstate, (type, children) = self.stack.pop()
# If there's exactly one child, return that child instead of creating a
# new node. We still create expr_stmt and file_input though, because a
# lot of Jedi depends on its logic.
if len(children) != 1 or type in (self.grammar.symbol2number['expr_stmt'],
self.grammar.symbol2number['file_input']):
newnode = self.convert_node(self.grammar, type, children)
else:
newnode = children[0]
try:
# Equal to:
# dfa, state, node = self.stack[-1]
# symbol, children = node
self.stack[-1][2][1].append(newnode)
except IndexError:
# stack is empty, set the rootnode.
self.rootnode = newnode