This also includes a rework of error recovery in the parser, which is now only possible for file_input parsing, i.e. for full files. It also includes a refactoring of the tokenizer: we no longer have to add an additional newline, because the tokenizer now works correctly (which removes some confusion).
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

# Modifications:
# Copyright 2014 David Halter. Integration into Jedi.
# Modifications are dual-licensed: MIT and PSF.

"""
Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.
"""

# Local imports
from jedi.parser import tokenize
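
# A grammar table is typically produced by pgen and loaded before parsing.
# A sketch, assuming the lib2to3-style pgen API on which this fork is based
# (the module path and grammar file name are illustrative):
#
#     from jedi.parser.pgen2 import pgen
#     grammar = pgen.generate_grammar('grammar3.4.txt')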


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(self, msg, type, value, start_pos):
        Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
                           (msg, tokenize.tok_name[type], value, start_pos))
        self.msg = msg
        self.type = type
        self.value = value
        self.start_pos = start_pos


class PgenParser(object):
    """Parser engine.

    The proper usage sequence is:

        p = PgenParser(grammar, convert_node, convert_leaf,
                       error_recovery, start)
        rootnode = p.parse(tokenizer)

    or, token by token:

        for each input token:
            if p.addtoken(...):  # parse a token; may raise ParseError
                break
        root = p.rootnode  # root of the syntax tree

    A PgenParser instance contains state pertaining to the current
    token sequence and should not be used concurrently by different
    threads to parse separate token sequences; create a new instance
    for each parse.

    See the jedi.parser.tokenize module for how to get input tokens by
    tokenizing a file or string.

    Parsing is complete when addtoken() returns True; the root of the
    syntax tree can then be retrieved from the rootnode instance
    variable.  When the parser gets stuck, the error_recovery callback
    passed to the constructor is invoked; ParseError is only raised
    for input that is too long or incomplete (see addtoken() and
    parse()).
    """

    def __init__(self, grammar, convert_node, convert_leaf, error_recovery,
                 start):
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The convert_node and convert_leaf arguments are functions that
        turn concrete syntax tree nodes and leaves into the objects
        used by the caller; the tree is converted from the bottom up.
        What they return is entirely up to the caller.

        The error_recovery argument is a callback that is invoked
        whenever the parser would otherwise be stuck; see addtoken()
        and parse().

        The start argument is the grammar symbol number to start
        parsing from; error recovery only works when starting from
        file_input, i.e. when parsing a full file.

        Internally, a concrete syntax tree node is a (type, children)
        tuple, where type is the node type (a token or symbol number)
        and children is a list of already-converted child nodes.
        """
        self.grammar = grammar
        self.convert_node = convert_node
        self.convert_leaf = convert_leaf

        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, children),
        # where children is a list of nodes.
        newnode = (start, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack = [stackentry]
        self.rootnode = None
        self.error_recovery = error_recovery

    def parse(self, tokenizer):
        for type, value, prefix, start_pos in tokenizer:
            if self.addtoken(type, value, prefix, start_pos):
                break
        else:
            # We never broke out -- EOF is too soon -- unfinished statement.
            self.error_recovery(self.grammar, self.stack, type, value,
                                start_pos, prefix, self.addtoken)
            # Add the ENDMARKER again.
            if not self.addtoken(type, value, prefix, start_pos):
                raise ParseError("incomplete input", type, value, start_pos)
        return self.rootnode
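
    # parse() accepts any iterable yielding (type, value, prefix, start_pos)
    # 4-tuples; in Jedi these come from the tokenize module imported above.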

    def addtoken(self, type, value, prefix, start_pos):
        """Add a token; return True if this is the end of the program."""
        # Map from token to label
        if type == tokenize.NAME:
            # Check for reserved words (keywords)
            try:
                ilabel = self.grammar.keywords[value]
            except KeyError:
                ilabel = self.grammar.tokens[type]
        else:
            ilabel = self.grammar.tokens[type]
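
        # Labels below 256 denote terminals (token types); labels of 256
        # and up denote grammar symbols (nonterminals), following the
        # numbering convention of CPython's Parser/parser.c -- hence the
        # t < 256 and t >= 256 checks below.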

        # Loop until the token is shifted; may raise exceptions
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t, v = self.grammar.labels[i]
                if ilabel == i:
                    # Look it up in the list of labels
                    assert t < 256
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, prefix, start_pos)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    return False
                elif t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, itsdfa, newstate)
                        break  # To continue the outer while loop
            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input", type, value,
                                         start_pos)
                else:
                    self.error_recovery(self.grammar, self.stack, type,
                                        value, start_pos, prefix,
                                        self.addtoken)
                    break

    def shift(self, type, value, newstate, prefix, start_pos):
        """Shift a token.  (Internal)"""
        dfa, state, node = self.stack[-1]
        newnode = self.convert_leaf(self.grammar, type, value, prefix,
                                    start_pos)
        node[-1].append(newnode)
        self.stack[-1] = (dfa, newstate, node)

    def push(self, type, newdfa, newstate):
        """Push a nonterminal.  (Internal)"""
        dfa, state, node = self.stack[-1]
        newnode = (type, [])
        self.stack[-1] = (dfa, newstate, node)
        self.stack.append((newdfa, 0, newnode))

    def pop(self):
        """Pop a nonterminal.  (Internal)"""
        popdfa, popstate, (type, children) = self.stack.pop()
        # If there's exactly one child, return that child instead of creating
        # a new node.  We still create expr_stmt and file_input though,
        # because a lot of Jedi depends on its logic.
        if len(children) == 1:
            newnode = children[0]
        else:
            newnode = self.convert_node(self.grammar, type, children)

        try:
            # Equal to:
            #     dfa, state, node = self.stack[-1]
            #     symbol, children = node
            self.stack[-1][2][1].append(newnode)
        except IndexError:
            # Stack is empty, set the rootnode.
            self.rootnode = newnode