Mirror of https://github.com/davidhalter/parso.git (synced 2025-12-11)
Merge the PgenParser and our own parser
Changed files:
  parso/parser.py (150 lines changed)
  parso/pgen2/__init__.py
  parso/pgen2/parse.py (deleted, 201 lines)
  parso/python/diff.py
parso/parser.py
@@ -1,3 +1,11 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+# Modifications:
+# Copyright David Halter and Contributors
+# Modifications are dual-licensed: MIT and PSF.
+# 99% of the code is different from pgen2, now.
+
 """
 The ``Parser`` tries to convert the available Python code in an easy to read
 format, something like an abstract syntax tree. The classes who represent this
@@ -16,7 +24,6 @@ complexity of the ``Parser`` (there's another parser sitting inside
 ``Statement``, which produces ``Array`` and ``Call``).
 """
 from parso import tree
-from parso.pgen2 import PgenParser
 
 
 class ParserSyntaxError(Exception):
@@ -30,7 +37,81 @@ class ParserSyntaxError(Exception):
         self.error_leaf = error_leaf
 
 
+class InternalParseError(Exception):
+    """
+    Exception to signal the parser is stuck and error recovery didn't help.
+    Basically this shouldn't happen. It's a sign that something is really
+    wrong.
+    """
+
+    def __init__(self, msg, type_, value, start_pos):
+        Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
+                           (msg, type_.name, value, start_pos))
+        self.msg = msg
+        self.type = type_
+        self.value = value
+        self.start_pos = start_pos
+
+
+class Stack(list):
+    def get_tos_nodes(self):
+        tos = self[-1]
+        return tos[2][1]
+
+    def get_tos_first_tokens(self, grammar):
+        tos = self[-1]
+        inv_tokens = dict((v, k) for k, v in grammar.tokens.items())
+        inv_keywords = dict((v, k) for k, v in grammar.keywords.items())
+        dfa, state, nodes = tos
+
+        def check():
+            for first in dfa[1]:
+                try:
+                    yield inv_keywords[first]
+                except KeyError:
+                    yield tokenize.tok_name[inv_tokens[first]]
+
+        return sorted(check())
+
+
+class StackNode(object):
+    def __init__(self, dfa):
+        self.dfa = dfa
+        self.nodes = []
+
+    @property
+    def nonterminal(self):
+        return self.dfa.from_rule
+
+    def __repr__(self):
+        return '%s(%s, %s)' % (self.__class__.__name__, self.dfa, self.nodes)
+
+
+def _token_to_transition(grammar, type_, value):
+    # Map from token to label
+    if type_.contains_syntax:
+        # Check for reserved words (keywords)
+        try:
+            return grammar.reserved_syntax_strings[value]
+        except KeyError:
+            pass
+
+    return type_
+
+
 class BaseParser(object):
+    """Parser engine.
+
+    A Parser instance contains state pertaining to the current token
+    sequence, and should not be used concurrently by different threads
+    to parse separate token sequences.
+
+    See python/tokenize.py for how to get input tokens from a string.
+
+    When a syntax error occurs, error_recovery() is called.
+    """
+
     node_map = {}
     default_node = tree.Node
 
@@ -44,15 +125,21 @@ class BaseParser(object):
         self._error_recovery = error_recovery
 
     def parse(self, tokens):
-        self.pgen_parser = PgenParser(
-            self._pgen_grammar, self.convert_node, self.convert_leaf,
-            self.error_recovery, self._start_nonterminal
-        )
+        first_dfa = self._pgen_grammar.nonterminal_to_dfas[self._start_nonterminal][0]
+        self.stack = Stack([StackNode(first_dfa)])
 
-        node = self.pgen_parser.parse(tokens)
-        # The stack is empty now, we don't need it anymore.
-        del self.pgen_parser
-        return node
+        for type_, value, start_pos, prefix in tokens:
+            self.add_token(type_, value, start_pos, prefix)
+
+        while self.stack and self.stack[-1].dfa.is_final:
+            self._pop()
+
+        if self.stack:
+            # We never broke out -- EOF is too soon -- Unfinished statement.
+            # However, the error recovery might have added the token again, if
+            # the stack is empty, we're fine.
+            raise InternalParseError("incomplete input", type_, value, start_pos)
+        return self.rootnode
 
     def error_recovery(self, pgen_grammar, stack, typ, value, start_pos, prefix,
                        add_token_callback):
@@ -73,3 +160,48 @@
             return self.leaf_map[type_](value, start_pos, prefix)
         except KeyError:
             return self.default_leaf(value, start_pos, prefix)
+
+    def add_token(self, type_, value, start_pos, prefix):
+        """Add a token to the parse stack; may trigger error_recovery()."""
+        grammar = self._pgen_grammar
+        stack = self.stack
+        transition = _token_to_transition(grammar, type_, value)
+
+        while True:
+            try:
+                plan = stack[-1].dfa.transition_to_plan[transition]
+                break
+            except KeyError:
+                if stack[-1].dfa.is_final:
+                    self._pop()
+                else:
+                    self.error_recovery(grammar, stack, type_,
+                                        value, start_pos, prefix, self.add_token)
+                    return
+            except IndexError:
+                raise InternalParseError("too much input", type_, value, start_pos)
+
+        stack[-1].dfa = plan.next_dfa
+
+        for push in plan.dfa_pushes:
+            stack.append(StackNode(push))
+
+        leaf = self.convert_leaf(grammar, type_, value, prefix, start_pos)
+        stack[-1].nodes.append(leaf)
+
+    def _pop(self):
+        tos = self.stack.pop()
+        # If there's exactly one child, return that child instead of
+        # creating a new node. We still create expr_stmt and
+        # file_input though, because a lot of Jedi depends on its
+        # logic.
+        if len(tos.nodes) == 1:
+            new_node = tos.nodes[0]
+        else:
+            new_node = self.convert_node(self._pgen_grammar, tos.dfa.from_rule, tos.nodes)
+
+        try:
+            self.stack[-1].nodes.append(new_node)
+        except IndexError:
+            # Stack is empty, set the rootnode.
+            self.rootnode = new_node
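The merged engine above is a classic table-driven shift-reduce loop: add_token() reduces finished rules (via _pop()) until some state on top of the stack can shift the token, then advances that state's DFA and pushes any sub-rules. Below is a minimal, runnable sketch of that loop on a toy one-rule grammar. MiniPlan, MiniDFA, and MiniParser are hypothetical stand-ins invented for illustration; only the attribute names (transition_to_plan, next_dfa, dfa_pushes, is_final, from_rule) are taken from the diff, and the "too much input" and error-recovery paths are omitted.

class MiniPlan(object):
    def __init__(self, next_dfa, dfa_pushes=()):
        self.next_dfa = next_dfa
        self.dfa_pushes = list(dfa_pushes)


class MiniDFA(object):
    def __init__(self, from_rule, is_final=False):
        self.from_rule = from_rule
        self.is_final = is_final
        self.transition_to_plan = {}


# Toy grammar with a single rule: sum -> NUMBER '+' NUMBER
s0 = MiniDFA('sum')                 # start state
s1 = MiniDFA('sum')                 # after the first NUMBER
s2 = MiniDFA('sum')                 # after '+'
s3 = MiniDFA('sum', is_final=True)  # after the second NUMBER
s0.transition_to_plan['NUMBER'] = MiniPlan(s1)
s1.transition_to_plan['+'] = MiniPlan(s2)
s2.transition_to_plan['NUMBER'] = MiniPlan(s3)


class MiniParser(object):
    def __init__(self, start_dfa):
        # Like BaseParser.parse(): the stack starts with the start rule's DFA.
        self.stack = [[start_dfa, []]]  # each entry: [current dfa, nodes]
        self.rootnode = None

    def add_token(self, token):
        # Like BaseParser.add_token(): reduce finished rules until a state on
        # top of the stack can shift the token, then advance and shift it.
        while True:
            try:
                plan = self.stack[-1][0].transition_to_plan[token]
                break
            except KeyError:
                if self.stack[-1][0].is_final:
                    self._pop()
                else:
                    raise SyntaxError('no transition for %r' % token)
        self.stack[-1][0] = plan.next_dfa
        for push in plan.dfa_pushes:
            self.stack.append([push, []])
        self.stack[-1][1].append(token)

    def _pop(self):
        # Like BaseParser._pop(): pass a single child through unchanged,
        # otherwise reduce the finished rule to one (rule, children) node.
        dfa, nodes = self.stack.pop()
        new_node = nodes[0] if len(nodes) == 1 else (dfa.from_rule, nodes)
        if self.stack:
            self.stack[-1][1].append(new_node)
        else:
            self.rootnode = new_node


parser = MiniParser(s0)
for token in ['NUMBER', '+', 'NUMBER']:
    parser.add_token(token)
while parser.stack and parser.stack[-1][0].is_final:
    parser._pop()
print(parser.rootnode)  # -> ('sum', ['NUMBER', '+', 'NUMBER'])

The single-child pass-through in _pop() is the same optimization the real _pop() applies before calling convert_node().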
parso/pgen2/__init__.py
@@ -8,4 +8,3 @@
 # Modifications are dual-licensed: MIT and PSF.
 
 from parso.pgen2.generator import generate_grammar
-from parso.pgen2.parse import PgenParser
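A consequence of this hunk (together with the deletion of parse.py below): PgenParser is no longer importable from parso.pgen2, while its logic lives on in parso.parser. A quick sanity check, assuming a parso checkout that includes this commit:

from parso.parser import BaseParser  # the merged engine

try:
    from parso.pgen2 import PgenParser
except ImportError:
    print('PgenParser is gone; use BaseParser (or a subclass) instead')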
parso/pgen2/parse.py (deleted)
@@ -1,201 +0,0 @@
-# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
-# Licensed to PSF under a Contributor Agreement.
-
-# Modifications:
-# Copyright David Halter and Contributors
-# Modifications are dual-licensed: MIT and PSF.
-
-"""
-Parser engine for the grammar tables generated by pgen.
-
-The grammar table must be loaded first.
-
-See Parser/parser.c in the Python distribution for additional info on
-how this parsing engine works.
-"""
-
-
-class InternalParseError(Exception):
-    """
-    Exception to signal the parser is stuck and error recovery didn't help.
-    Basically this shouldn't happen. It's a sign that something is really
-    wrong.
-    """
-
-    def __init__(self, msg, type_, value, start_pos):
-        Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
-                           (msg, type_.name, value, start_pos))
-        self.msg = msg
-        self.type = type
-        self.value = value
-        self.start_pos = start_pos
-
-
-class Stack(list):
-    def get_tos_nodes(self):
-        tos = self[-1]
-        return tos[2][1]
-
-    def get_tos_first_tokens(self, grammar):
-        tos = self[-1]
-        inv_tokens = dict((v, k) for k, v in grammar.tokens.items())
-        inv_keywords = dict((v, k) for k, v in grammar.keywords.items())
-        dfa, state, nodes = tos
-
-        def check():
-            for first in dfa[1]:
-                try:
-                    yield inv_keywords[first]
-                except KeyError:
-                    yield tokenize.tok_name[inv_tokens[first]]
-
-        return sorted(check())
-
-
-class StackNode(object):
-    def __init__(self, dfa):
-        self.dfa = dfa
-        self.nodes = []
-
-    @property
-    def nonterminal(self):
-        return self.dfa.from_rule
-
-    def __repr__(self):
-        return '%s(%s, %s)' % (self.__class__.__name__, self.dfa, self.nodes)
-
-
-def _token_to_transition(grammar, type_, value):
-    # Map from token to label
-    if type_.contains_syntax:
-        # Check for reserved words (keywords)
-        try:
-            return grammar.reserved_syntax_strings[value]
-        except KeyError:
-            pass
-
-    return type_
-
-
-class PgenParser(object):
-    """Parser engine.
-
-    The proper usage sequence is:
-
-    p = Parser(grammar, [converter])  # create instance
-    p.setup([start])                  # prepare for parsing
-    <for each input token>:
-        if p.add_token(...):          # parse a token
-            break
-    root = p.rootnode                 # root of abstract syntax tree
-
-    A Parser instance may be reused by calling setup() repeatedly.
-
-    A Parser instance contains state pertaining to the current token
-    sequence, and should not be used concurrently by different threads
-    to parse separate token sequences.
-
-    See driver.py for how to get input tokens by tokenizing a file or
-    string.
-
-    Parsing is complete when add_token() returns True; the root of the
-    abstract syntax tree can then be retrieved from the rootnode
-    instance variable. When a syntax error occurs, error_recovery()
-    is called. There is no error recovery; the parser cannot be used
-    after a syntax error was reported (but it can be reinitialized by
-    calling setup()).
-
-    """
-
-    def __init__(self, grammar, convert_node, convert_leaf, error_recovery,
-                 start_nonterminal):
-        """Constructor.
-
-        The grammar argument is a grammar.Grammar instance; see the
-        grammar module for more information.
-
-        The parser is not ready yet for parsing; you must call the
-        setup() method to get it started.
-
-        The optional convert argument is a function mapping concrete
-        syntax tree nodes to abstract syntax tree nodes. If not
-        given, no conversion is done and the syntax tree produced is
-        the concrete syntax tree. If given, it must be a function of
-        two arguments, the first being the grammar (a grammar.Grammar
-        instance), and the second being the concrete syntax tree node
-        to be converted. The syntax tree is converted from the bottom
-        up.
-
-        A concrete syntax tree node is a (type, nodes) tuple, where
-        type is the node type (a token or nonterminal number) and nodes
-        is a list of children for nonterminals, and None for tokens.
-
-        An abstract syntax tree node may be anything; this is entirely
-        up to the converter function.
-
-        """
-        self.grammar = grammar
-        self.convert_node = convert_node
-        self.convert_leaf = convert_leaf
-
-        self.stack = Stack([StackNode(grammar.nonterminal_to_dfas[start_nonterminal][0])])
-        self.error_recovery = error_recovery
-
-    def parse(self, tokens):
-        for type_, value, start_pos, prefix in tokens:
-            self.add_token(type_, value, start_pos, prefix)
-
-        while self.stack and self.stack[-1].dfa.is_final:
-            self._pop()
-
-        if self.stack:
-            # We never broke out -- EOF is too soon -- Unfinished statement.
-            # However, the error recovery might have added the token again, if
-            # the stack is empty, we're fine.
-            raise InternalParseError("incomplete input", type_, value, start_pos)
-        return self.rootnode
-
-    def add_token(self, type_, value, start_pos, prefix):
-        """Add a token; return True if this is the end of the program."""
-        transition = _token_to_transition(self.grammar, type_, value)
-        stack = self.stack
-        grammar = self.grammar
-
-        while True:
-            try:
-                plan = stack[-1].dfa.transition_to_plan[transition]
-                break
-            except KeyError:
-                if stack[-1].dfa.is_final:
-                    self._pop()
-                else:
-                    self.error_recovery(grammar, stack, type_,
-                                        value, start_pos, prefix, self.add_token)
-                    return
-            except IndexError:
-                raise InternalParseError("too much input", type_, value, start_pos)
-
-        stack[-1].dfa = plan.next_dfa
-
-        for push in plan.dfa_pushes:
-            stack.append(StackNode(push))
-
-        leaf = self.convert_leaf(grammar, type_, value, prefix, start_pos)
-        stack[-1].nodes.append(leaf)
-
-    def _pop(self):
-        tos = self.stack.pop()
-        # If there's exactly one child, return that child instead of
-        # creating a new node. We still create expr_stmt and
-        # file_input though, because a lot of Jedi depends on its
-        # logic.
-        if len(tos.nodes) == 1:
-            new_node = tos.nodes[0]
-        else:
-            new_node = self.convert_node(self.grammar, tos.dfa.from_rule, tos.nodes)
-
-        try:
-            self.stack[-1].nodes.append(new_node)
-        except IndexError:
-            # Stack is empty, set the rootnode.
-            self.rootnode = new_node
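The deleted docstring above describes the old driving protocol (setup(), then add_token() until it returns True). After the merge, driving the parser is simply BaseParser.parse(tokens); the customization point that survives is the error_recovery() hook, with the signature shown in parser.py. Here is a sketch of a subclass using it, assuming ParserSyntaxError takes a message and an error leaf (as its __init__ in the first hunk suggests) and that convert_leaf keeps the argument order shown in add_token():

from parso.parser import BaseParser, ParserSyntaxError


class StrictParser(BaseParser):
    """Hypothetical subclass: no recovery, just a positioned syntax error."""

    def error_recovery(self, pgen_grammar, stack, typ, value, start_pos, prefix,
                       add_token_callback):
        # Wrap the offending token in a leaf so the error carries its
        # position, then give up instead of repairing the stack.
        error_leaf = self.convert_leaf(pgen_grammar, typ, value, prefix, start_pos)
        raise ParserSyntaxError('invalid syntax', error_leaf)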
parso/python/diff.py
@@ -287,7 +287,7 @@ class DiffParser(object):
         omitted_first_indent = False
         indents = []
         tokens = self._tokenizer(lines, (1, 0))
-        stack = self._active_parser.pgen_parser.stack
+        stack = self._active_parser.stack
         for typ, string, start_pos, prefix in tokens:
             start_pos = start_pos[0] + line_offset, start_pos[1]
             if typ == PythonTokenTypes.INDENT:
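Since the commit is a pure refactoring, the public API should behave exactly as before. A quick end-to-end check (a sketch, assuming a parso install at or after this commit):

import parso

module = parso.parse('x = 1 + 2\n')
print(module.children[0].type)             # the parsed statement node
print(module.get_code() == 'x = 1 + 2\n')  # parso round-trips source exactly: True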