mirror of https://github.com/davidhalter/parso.git
Merge the PgenParser and our own parser

parso/parser.py (150 lines changed)
@@ -1,3 +1,11 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+# Modifications:
+# Copyright David Halter and Contributors
+# Modifications are dual-licensed: MIT and PSF.
+# 99% of the code is different from pgen2, now.
+
 """
 The ``Parser`` tries to convert the available Python code in an easy to read
 format, something like an abstract syntax tree. The classes who represent this
@@ -16,7 +24,6 @@ complexity of the ``Parser`` (there's another parser sitting inside
 ``Statement``, which produces ``Array`` and ``Call``).
 """
 from parso import tree
-from parso.pgen2 import PgenParser
 
 
 class ParserSyntaxError(Exception):
@@ -30,7 +37,81 @@ class ParserSyntaxError(Exception):
         self.error_leaf = error_leaf
 
 
+class InternalParseError(Exception):
+    """
+    Exception to signal the parser is stuck and error recovery didn't help.
+    Basically this shouldn't happen. It's a sign that something is really
+    wrong.
+    """
+
+    def __init__(self, msg, type_, value, start_pos):
+        Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
+                           (msg, type_.name, value, start_pos))
+        self.msg = msg
+        self.type = type_
+        self.value = value
+        self.start_pos = start_pos
+
+
+class Stack(list):
+    def get_tos_nodes(self):
+        tos = self[-1]
+        return tos[2][1]
+
+    def get_tos_first_tokens(self, grammar):
+        tos = self[-1]
+        inv_tokens = dict((v, k) for k, v in grammar.tokens.items())
+        inv_keywords = dict((v, k) for k, v in grammar.keywords.items())
+        dfa, state, nodes = tos
+
+        def check():
+            for first in dfa[1]:
+                try:
+                    yield inv_keywords[first]
+                except KeyError:
+                    yield tokenize.tok_name[inv_tokens[first]]
+
+        return sorted(check())
+
+
+class StackNode(object):
+    def __init__(self, dfa):
+        self.dfa = dfa
+        self.nodes = []
+
+    @property
+    def nonterminal(self):
+        return self.dfa.from_rule
+
+    def __repr__(self):
+        return '%s(%s, %s)' % (self.__class__.__name__, self.dfa, self.nodes)
+
+
+def _token_to_transition(grammar, type_, value):
+    # Map from token to label
+    if type_.contains_syntax:
+        # Check for reserved words (keywords)
+        try:
+            return grammar.reserved_syntax_strings[value]
+        except KeyError:
+            pass
+
+    return type_
+
+
 class BaseParser(object):
+    """Parser engine.
+
+    A Parser instance contains state pertaining to the current token
+    sequence, and should not be used concurrently by different threads
+    to parse separate token sequences.
+
+    See python/tokenize.py for how to get input tokens by a string.
+
+    When a syntax error occurs, error_recovery() is called.
+    """
+
     node_map = {}
     default_node = tree.Node
 
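
The `_token_to_transition` helper above is the hinge between the tokenizer and the DFA tables: a NAME token whose string happens to be a reserved word must follow the keyword transition, not the generic NAME one. A minimal runnable sketch of that logic, using toy stand-ins for parso's internals (only `contains_syntax` and `reserved_syntax_strings` are real attribute names taken from the diff; everything else is invented for illustration):

    class FakeTokenType:
        # Stand-in for parso's token type objects. contains_syntax mirrors
        # the attribute checked above: True for tokens (e.g. NAME) whose
        # string value might be a reserved keyword.
        def __init__(self, name, contains_syntax):
            self.name = name
            self.contains_syntax = contains_syntax

        def __repr__(self):
            return self.name

    class FakeGrammar:
        def __init__(self, keywords):
            # Keyword string -> transition label, as consulted above.
            self.reserved_syntax_strings = {k: 'keyword:' + k for k in keywords}

    def _token_to_transition(grammar, type_, value):
        if type_.contains_syntax:
            try:
                return grammar.reserved_syntax_strings[value]
            except KeyError:
                pass
        return type_

    NAME = FakeTokenType('NAME', contains_syntax=True)
    NUMBER = FakeTokenType('NUMBER', contains_syntax=False)
    grammar = FakeGrammar(['if', 'while'])

    print(_token_to_transition(grammar, NAME, 'if'))    # keyword:if
    print(_token_to_transition(grammar, NAME, 'foo'))   # NAME (plain identifier)
    print(_token_to_transition(grammar, NUMBER, '42'))  # NUMBER
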
@@ -44,15 +125,21 @@ class BaseParser(object):
         self._error_recovery = error_recovery
 
     def parse(self, tokens):
-        self.pgen_parser = PgenParser(
-            self._pgen_grammar, self.convert_node, self.convert_leaf,
-            self.error_recovery, self._start_nonterminal
-        )
+        first_dfa = self._pgen_grammar.nonterminal_to_dfas[self._start_nonterminal][0]
+        self.stack = Stack([StackNode(first_dfa)])
 
-        node = self.pgen_parser.parse(tokens)
-        # The stack is empty now, we don't need it anymore.
-        del self.pgen_parser
-        return node
+        for type_, value, start_pos, prefix in tokens:
+            self.add_token(type_, value, start_pos, prefix)
+
+        while self.stack and self.stack[-1].dfa.is_final:
+            self._pop()
+
+        if self.stack:
+            # We never broke out -- EOF is too soon -- Unfinished statement.
+            # However, the error recovery might have added the token again, if
+            # the stack is empty, we're fine.
+            raise InternalParseError("incomplete input", type_, value, start_pos)
+        return self.rootnode
 
     def error_recovery(self, pgen_grammar, stack, typ, value, start_pos, prefix,
                        add_token_callback):
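
With this hunk, `parse()` stops delegating to a separately constructed PgenParser: it seeds the stack with the start nonterminal's first DFA, shifts every token through `add_token()`, then pops whatever DFAs are already final at EOF. Externally nothing changes; parso's public entry point still round-trips source exactly (a quick check, assuming an installed parso):

    import parso

    # parso.parse() ultimately drives the BaseParser machinery shown above.
    module = parso.parse('x = 1\n')
    print(module.get_code() == 'x = 1\n')  # True: whitespace survives as leaf prefixes
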
@@ -73,3 +160,48 @@ class BaseParser(object):
             return self.leaf_map[type_](value, start_pos, prefix)
         except KeyError:
             return self.default_leaf(value, start_pos, prefix)
+
+    def add_token(self, type_, value, start_pos, prefix):
+        """Add a token; return True if this is the end of the program."""
+        grammar = self._pgen_grammar
+        stack = self.stack
+        transition = _token_to_transition(grammar, type_, value)
+
+        while True:
+            try:
+                plan = stack[-1].dfa.transition_to_plan[transition]
+                break
+            except KeyError:
+                if stack[-1].dfa.is_final:
+                    self._pop()
+                else:
+                    self.error_recovery(grammar, stack, type_,
+                                        value, start_pos, prefix, self.add_token)
+                    return
+            except IndexError:
+                raise InternalParseError("too much input", type_, value, start_pos)
+
+        stack[-1].dfa = plan.next_dfa
+
+        for push in plan.dfa_pushes:
+            stack.append(StackNode(push))
+
+        leaf = self.convert_leaf(grammar, type_, value, prefix, start_pos)
+        stack[-1].nodes.append(leaf)
+
+    def _pop(self):
+        tos = self.stack.pop()
+        # If there's exactly one child, return that child instead of
+        # creating a new node. We still create expr_stmt and
+        # file_input though, because a lot of Jedi depends on its
+        # logic.
+        if len(tos.nodes) == 1:
+            new_node = tos.nodes[0]
+        else:
+            new_node = self.convert_node(self._pgen_grammar, tos.dfa.from_rule, tos.nodes)
+
+        try:
+            self.stack[-1].nodes.append(new_node)
+        except IndexError:
+            # Stack is empty, set the rootnode.
+            self.rootnode = new_node
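
These two methods are the whole engine: `add_token()` is the shift step (look up a transition plan, descend into any pushed DFAs, append the leaf to the top of stack) and `_pop()` is the reduce step (turn a finished DFA's collected children into a node appended one level down). A self-contained toy run, with hand-built fakes for the DFA and plan objects (the attribute names match the diff; the objects themselves are invented):

    class FakePlan:
        def __init__(self, next_dfa, dfa_pushes=()):
            self.next_dfa = next_dfa            # state the current DFA moves to
            self.dfa_pushes = list(dfa_pushes)  # nested rules to descend into

    class FakeDFA:
        def __init__(self, from_rule, is_final=False):
            self.from_rule = from_rule
            self.is_final = is_final
            self.transition_to_plan = {}

    class StackNode:
        def __init__(self, dfa):
            self.dfa = dfa
            self.nodes = []

    # Toy rule: sum ::= NUMBER '+' NUMBER
    s0, s1, s2, s3 = (FakeDFA('sum') for _ in range(4))
    s3.is_final = True
    s0.transition_to_plan['NUMBER'] = FakePlan(s1)
    s1.transition_to_plan['+'] = FakePlan(s2)
    s2.transition_to_plan['NUMBER'] = FakePlan(s3)

    stack = [StackNode(s0)]
    for token in ['NUMBER', '+', 'NUMBER']:
        plan = stack[-1].dfa.transition_to_plan[token]  # shift: look up the plan
        stack[-1].dfa = plan.next_dfa
        for push in plan.dfa_pushes:                    # none in this flat rule
            stack.append(StackNode(push))
        stack[-1].nodes.append(token)                   # leaf lands on the TOS

    tos = stack.pop()                                   # reduce: the DFA is final
    print(tos.dfa.from_rule, tos.nodes)  # sum ['NUMBER', '+', 'NUMBER']
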
parso/pgen2/__init__.py
@@ -8,4 +8,3 @@
 # Modifications are dual-licensed: MIT and PSF.
 
 from parso.pgen2.generator import generate_grammar
-from parso.pgen2.parse import PgenParser
parso/pgen2/parse.py (deleted)
@@ -1,201 +0,0 @@
-# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
-# Licensed to PSF under a Contributor Agreement.
-
-# Modifications:
-# Copyright David Halter and Contributors
-# Modifications are dual-licensed: MIT and PSF.
-
-"""
-Parser engine for the grammar tables generated by pgen.
-
-The grammar table must be loaded first.
-
-See Parser/parser.c in the Python distribution for additional info on
-how this parsing engine works.
-"""
-
-
-class InternalParseError(Exception):
-    """
-    Exception to signal the parser is stuck and error recovery didn't help.
-    Basically this shouldn't happen. It's a sign that something is really
-    wrong.
-    """
-
-    def __init__(self, msg, type_, value, start_pos):
-        Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
-                           (msg, type_.name, value, start_pos))
-        self.msg = msg
-        self.type = type
-        self.value = value
-        self.start_pos = start_pos
-
-
-class Stack(list):
-    def get_tos_nodes(self):
-        tos = self[-1]
-        return tos[2][1]
-
-    def get_tos_first_tokens(self, grammar):
-        tos = self[-1]
-        inv_tokens = dict((v, k) for k, v in grammar.tokens.items())
-        inv_keywords = dict((v, k) for k, v in grammar.keywords.items())
-        dfa, state, nodes = tos
-
-        def check():
-            for first in dfa[1]:
-                try:
-                    yield inv_keywords[first]
-                except KeyError:
-                    yield tokenize.tok_name[inv_tokens[first]]
-
-        return sorted(check())
-
-
-class StackNode(object):
-    def __init__(self, dfa):
-        self.dfa = dfa
-        self.nodes = []
-
-    @property
-    def nonterminal(self):
-        return self.dfa.from_rule
-
-    def __repr__(self):
-        return '%s(%s, %s)' % (self.__class__.__name__, self.dfa, self.nodes)
-
-
-def _token_to_transition(grammar, type_, value):
-    # Map from token to label
-    if type_.contains_syntax:
-        # Check for reserved words (keywords)
-        try:
-            return grammar.reserved_syntax_strings[value]
-        except KeyError:
-            pass
-
-    return type_
-
-
-class PgenParser(object):
-    """Parser engine.
-
-    The proper usage sequence is:
-
-    p = Parser(grammar, [converter])  # create instance
-    p.setup([start])                  # prepare for parsing
-    <for each input token>:
-        if p.add_token(...):          # parse a token
-            break
-    root = p.rootnode                 # root of abstract syntax tree
-
-    A Parser instance may be reused by calling setup() repeatedly.
-
-    A Parser instance contains state pertaining to the current token
-    sequence, and should not be used concurrently by different threads
-    to parse separate token sequences.
-
-    See driver.py for how to get input tokens by tokenizing a file or
-    string.
-
-    Parsing is complete when add_token() returns True; the root of the
-    abstract syntax tree can then be retrieved from the rootnode
-    instance variable.  When a syntax error occurs, error_recovery()
-    is called.  There is no error recovery; the parser cannot be used
-    after a syntax error was reported (but it can be reinitialized by
-    calling setup()).
-    """
-
-    def __init__(self, grammar, convert_node, convert_leaf, error_recovery,
-                 start_nonterminal):
-        """Constructor.
-
-        The grammar argument is a grammar.Grammar instance; see the
-        grammar module for more information.
-
-        The parser is not ready yet for parsing; you must call the
-        setup() method to get it started.
-
-        The optional convert argument is a function mapping concrete
-        syntax tree nodes to abstract syntax tree nodes.  If not
-        given, no conversion is done and the syntax tree produced is
-        the concrete syntax tree.  If given, it must be a function of
-        two arguments, the first being the grammar (a grammar.Grammar
-        instance), and the second being the concrete syntax tree node
-        to be converted.  The syntax tree is converted from the bottom
-        up.
-
-        A concrete syntax tree node is a (type, nodes) tuple, where
-        type is the node type (a token or nonterminal number) and nodes
-        is a list of children for nonterminals, and None for tokens.
-
-        An abstract syntax tree node may be anything; this is entirely
-        up to the converter function.
-        """
-        self.grammar = grammar
-        self.convert_node = convert_node
-        self.convert_leaf = convert_leaf
-
-        self.stack = Stack([StackNode(grammar.nonterminal_to_dfas[start_nonterminal][0])])
-        self.error_recovery = error_recovery
-
-    def parse(self, tokens):
-        for type_, value, start_pos, prefix in tokens:
-            self.add_token(type_, value, start_pos, prefix)
-
-        while self.stack and self.stack[-1].dfa.is_final:
-            self._pop()
-
-        if self.stack:
-            # We never broke out -- EOF is too soon -- Unfinished statement.
-            # However, the error recovery might have added the token again, if
-            # the stack is empty, we're fine.
-            raise InternalParseError("incomplete input", type_, value, start_pos)
-        return self.rootnode
-
-    def add_token(self, type_, value, start_pos, prefix):
-        """Add a token; return True if this is the end of the program."""
-        transition = _token_to_transition(self.grammar, type_, value)
-        stack = self.stack
-        grammar = self.grammar
-
-        while True:
-            try:
-                plan = stack[-1].dfa.transition_to_plan[transition]
-                break
-            except KeyError:
-                if stack[-1].dfa.is_final:
-                    self._pop()
-                else:
-                    self.error_recovery(grammar, stack, type_,
-                                        value, start_pos, prefix, self.add_token)
-                    return
-            except IndexError:
-                raise InternalParseError("too much input", type_, value, start_pos)
-
-        stack[-1].dfa = plan.next_dfa
-
-        for push in plan.dfa_pushes:
-            stack.append(StackNode(push))
-
-        leaf = self.convert_leaf(grammar, type_, value, prefix, start_pos)
-        stack[-1].nodes.append(leaf)
-
-    def _pop(self):
-        tos = self.stack.pop()
-        # If there's exactly one child, return that child instead of
-        # creating a new node. We still create expr_stmt and
-        # file_input though, because a lot of Jedi depends on its
-        # logic.
-        if len(tos.nodes) == 1:
-            new_node = tos.nodes[0]
-        else:
-            new_node = self.convert_node(self.grammar, tos.dfa.from_rule, tos.nodes)
-
-        try:
-            self.stack[-1].nodes.append(new_node)
-        except IndexError:
-            # Stack is empty, set the rootnode.
-            self.rootnode = new_node
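
With parse.py gone, everything it defined lives on in parso/parser.py, and there is no separate engine object left to construct: the BaseParser subclass is the engine, so the `p = Parser(...)` / `p.setup(...)` protocol described in the docstring above disappears with the file. As of this commit the machinery imports from its new home (module layout may have shifted again in later releases):

    # The stack machinery now lives next to BaseParser instead of in
    # parso.pgen2.parse, which this commit deletes.
    from parso.parser import BaseParser, Stack, StackNode, InternalParseError

    # Hypothetical driver, reconstructed from the diff rather than from a
    # documented API: a grammar plus a start symbol is all parse() needs now.
    #     parser = SomeBaseParserSubclass(pgen_grammar, start_nonterminal)
    #     tree = parser.parse(tokens)
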
parso/python/diff.py
@@ -287,7 +287,7 @@ class DiffParser(object):
         omitted_first_indent = False
         indents = []
         tokens = self._tokenizer(lines, (1, 0))
-        stack = self._active_parser.pgen_parser.stack
+        stack = self._active_parser.stack
         for typ, string, start_pos, prefix in tokens:
             start_pos = start_pos[0] + line_offset, start_pos[1]
             if typ == PythonTokenTypes.INDENT:
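
The attribute path is the only change here: DiffParser no longer reaches through a composed `pgen_parser`, because the active parser now carries the stack itself. The surrounding loop is still worth a second look: DiffParser re-tokenizes a slice of the file starting at line 1, then shifts each token back to its true location. The tuple arithmetic in isolation (values invented for illustration):

    line_offset = 286              # suppose the re-tokenized slice began at line 287
    start_pos = (2, 4)             # token position relative to the slice
    start_pos = start_pos[0] + line_offset, start_pos[1]
    print(start_pos)               # (288, 4): the line is shifted, the column is not
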