Merge the PgenParser and our own parser

Dave Halter
2018-06-27 23:45:04 +02:00
parent edce279dee
commit f7d3d4e82f
4 changed files with 142 additions and 212 deletions
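In short, this merges pgen2's generic driver (PgenParser) into parso's own BaseParser: the token loop, the DFA stack, and the tree-node conversion now live in one class. A minimal sketch of what exercises the merged code path, assuming parso's public load_grammar()/parse() API (only the internal names come from the diff below):

import parso

# load_grammar() builds the pgen grammar whose DFAs drive the parser.
grammar = parso.load_grammar()
# Grammar.parse() tokenizes the source and feeds every token to
# BaseParser.add_token(), which shifts it through the DFA stack.
module = grammar.parse('x = 1\n')
print(module.children[0].type)  # 'simple_stmt'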

parso/parser.py

@@ -1,3 +1,11 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
# 99% of the code is different from pgen2, now.
"""
The ``Parser`` tries to convert the available Python code into an easy-to-read
format, something like an abstract syntax tree. The classes that represent this
@@ -16,7 +24,6 @@ complexity of the ``Parser`` (there's another parser sitting inside
``Statement``, which produces ``Array`` and ``Call``).
"""
from parso import tree
from parso.pgen2 import PgenParser
class ParserSyntaxError(Exception):
@@ -30,7 +37,81 @@ class ParserSyntaxError(Exception):
        self.error_leaf = error_leaf
class InternalParseError(Exception):
"""
Exception to signal the parser is stuck and error recovery didn't help.
Basically this shouldn't happen. It's a sign that something is really
wrong.
"""
def __init__(self, msg, type_, value, start_pos):
Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
(msg, type_.name, value, start_pos))
self.msg = msg
self.type = type
self.value = value
self.start_pos = start_pos
class Stack(list):
    def get_tos_nodes(self):
        return self[-1].nodes

    def get_tos_first_tokens(self, grammar):
        # The stack holds StackNode objects; the transitions of the DFA on
        # top are either reserved strings (keywords/operators) or token types.
        inv_keywords = dict(
            (reserved, value)
            for value, reserved in grammar.reserved_syntax_strings.items()
        )

        def check():
            for transition in self[-1].dfa.transition_to_plan:
                try:
                    yield inv_keywords[transition]
                except KeyError:
                    # Not reserved syntax, so it's a token type.
                    yield transition.name

        return sorted(check())
class StackNode(object):
    def __init__(self, dfa):
        self.dfa = dfa
        self.nodes = []

    @property
    def nonterminal(self):
        return self.dfa.from_rule

    def __repr__(self):
        return '%s(%s, %s)' % (self.__class__.__name__, self.dfa, self.nodes)
def _token_to_transition(grammar, type_, value):
    # Map from token to label
    if type_.contains_syntax:
        # Check for reserved words (keywords)
        try:
            return grammar.reserved_syntax_strings[value]
        except KeyError:
            pass
    return type_
class BaseParser(object):
    """Parser engine.

    A Parser instance contains state pertaining to the current token
    sequence and should not be used concurrently by different threads
    to parse separate token sequences.

    See python/tokenize.py for how to get input tokens from a string.

    When a syntax error occurs, error_recovery() is called.
    """

    node_map = {}
    default_node = tree.Node
@@ -44,15 +125,21 @@ class BaseParser(object):
        self._error_recovery = error_recovery
    def parse(self, tokens):
        self.pgen_parser = PgenParser(
            self._pgen_grammar, self.convert_node, self.convert_leaf,
            self.error_recovery, self._start_nonterminal
        )
        first_dfa = self._pgen_grammar.nonterminal_to_dfas[self._start_nonterminal][0]
        self.stack = Stack([StackNode(first_dfa)])
        node = self.pgen_parser.parse(tokens)
        # The stack is empty now, we don't need it anymore.
        del self.pgen_parser
        return node
        for type_, value, start_pos, prefix in tokens:
            self.add_token(type_, value, start_pos, prefix)

        while self.stack and self.stack[-1].dfa.is_final:
            self._pop()

        if self.stack:
            # The stack did not empty out, so EOF came too soon: the input is
            # an unfinished statement. (Error recovery may have consumed the
            # token and emptied the stack already, in which case we're fine.)
            raise InternalParseError("incomplete input", type_, value, start_pos)
        return self.rootnode
    def error_recovery(self, pgen_grammar, stack, typ, value, start_pos, prefix,
                       add_token_callback):
@@ -73,3 +160,48 @@ class BaseParser(object):
            return self.leaf_map[type_](value, start_pos, prefix)
        except KeyError:
            return self.default_leaf(value, start_pos, prefix)
    def add_token(self, type_, value, start_pos, prefix):
        """Add a token to the parser; this may trigger error recovery."""
        grammar = self._pgen_grammar
        stack = self.stack
        transition = _token_to_transition(grammar, type_, value)

        while True:
            try:
                plan = stack[-1].dfa.transition_to_plan[transition]
                break
            except KeyError:
                if stack[-1].dfa.is_final:
                    self._pop()
                else:
                    self.error_recovery(grammar, stack, type_,
                                        value, start_pos, prefix, self.add_token)
                    return
            except IndexError:
                raise InternalParseError("too much input", type_, value, start_pos)

        stack[-1].dfa = plan.next_dfa

        for push in plan.dfa_pushes:
            stack.append(StackNode(push))

        leaf = self.convert_leaf(grammar, type_, value, prefix, start_pos)
        stack[-1].nodes.append(leaf)
    def _pop(self):
        tos = self.stack.pop()
        # If there's exactly one child, return that child instead of
        # creating a new node. We still create expr_stmt and
        # file_input though, because a lot of Jedi depends on its
        # logic.
        if len(tos.nodes) == 1:
            new_node = tos.nodes[0]
        else:
            new_node = self.convert_node(self._pgen_grammar, tos.dfa.from_rule,
                                         tos.nodes)

        try:
            self.stack[-1].nodes.append(new_node)
        except IndexError:
            # Stack is empty, set the rootnode.
            self.rootnode = new_node
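The heart of the merged class is the shift/reduce loop above: add_token() follows a plan out of the top DFA, pushes a StackNode for every nested rule, and _pop() reduces finished rules into their parent. Below is a self-contained toy sketch of that mechanic; the grammar, the Plan/DFA classes, and all names in it are invented for illustration and are not parso's real objects:

from collections import namedtuple

Plan = namedtuple('Plan', ['next_dfa', 'dfa_pushes'])

class DFA(object):
    def __init__(self, from_rule, is_final=False):
        self.from_rule = from_rule
        self.is_final = is_final
        self.transition_to_plan = {}

class StackNode(object):
    def __init__(self, dfa):
        self.dfa = dfa
        self.nodes = []

# Toy grammar with a single rule -- stmt: NAME '=' NAME
s0, s1, s2, s3 = DFA('stmt'), DFA('stmt'), DFA('stmt'), DFA('stmt', is_final=True)
s0.transition_to_plan['NAME'] = Plan(s1, [])
s1.transition_to_plan['='] = Plan(s2, [])
s2.transition_to_plan['NAME'] = Plan(s3, [])

stack = [StackNode(s0)]
for transition, value in [('NAME', 'x'), ('=', '='), ('NAME', 'y')]:
    # add_token(): advance the top DFA, push nested rules, append the leaf.
    plan = stack[-1].dfa.transition_to_plan[transition]
    stack[-1].dfa = plan.next_dfa
    for push in plan.dfa_pushes:
        stack.append(StackNode(push))
    stack[-1].nodes.append(value)

# At EOF every finished DFA is popped into its parent node (_pop()); here
# only the root remains and becomes the tree.
assert stack[-1].dfa.is_final
print((stack[-1].dfa.from_rule, stack[-1].nodes))  # ('stmt', ['x', '=', 'y'])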

parso/pgen2/__init__.py

@@ -8,4 +8,3 @@
# Modifications are dual-licensed: MIT and PSF.
from parso.pgen2.generator import generate_grammar
from parso.pgen2.parse import PgenParser
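After this change parso.pgen2 only re-exports the grammar generator; the parse module disappears entirely (next section). A quick sketch of the resulting package surface:

from parso.pgen2 import generate_grammar    # still exported
# from parso.pgen2.parse import PgenParser  # gone -- parse.py is deleted below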

parso/pgen2/parse.py

@@ -1,201 +0,0 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
"""
Parser engine for the grammar tables generated by pgen.
The grammar table must be loaded first.
See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.
"""
class InternalParseError(Exception):
    """
    Exception to signal the parser is stuck and error recovery didn't help.
    Basically this shouldn't happen. It's a sign that something is really
    wrong.
    """

    def __init__(self, msg, type_, value, start_pos):
        Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
                           (msg, type_.name, value, start_pos))
        self.msg = msg
        self.type = type
        self.value = value
        self.start_pos = start_pos
class Stack(list):
    def get_tos_nodes(self):
        tos = self[-1]
        return tos[2][1]

    def get_tos_first_tokens(self, grammar):
        tos = self[-1]
        inv_tokens = dict((v, k) for k, v in grammar.tokens.items())
        inv_keywords = dict((v, k) for k, v in grammar.keywords.items())
        dfa, state, nodes = tos

        def check():
            for first in dfa[1]:
                try:
                    yield inv_keywords[first]
                except KeyError:
                    yield tokenize.tok_name[inv_tokens[first]]
        return sorted(check())
class StackNode(object):
    def __init__(self, dfa):
        self.dfa = dfa
        self.nodes = []

    @property
    def nonterminal(self):
        return self.dfa.from_rule

    def __repr__(self):
        return '%s(%s, %s)' % (self.__class__.__name__, self.dfa, self.nodes)
def _token_to_transition(grammar, type_, value):
    # Map from token to label
    if type_.contains_syntax:
        # Check for reserved words (keywords)
        try:
            return grammar.reserved_syntax_strings[value]
        except KeyError:
            pass
    return type_
class PgenParser(object):
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup([start])                  # prepare for parsing
    <for each input token>:
        if p.add_token(...):          # parse a token
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when add_token() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable.  When a syntax error occurs, error_recovery()
    is called.  There is no error recovery; the parser cannot be used
    after a syntax error was reported (but it can be reinitialized by
    calling setup()).
    """
    def __init__(self, grammar, convert_node, convert_leaf, error_recovery,
                 start_nonterminal):
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes.  If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree.  If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted.  The syntax tree is converted from the bottom
        up.

        A concrete syntax tree node is a (type, nodes) tuple, where
        type is the node type (a token or nonterminal number) and nodes
        is a list of children for nonterminals, and None for tokens.

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.
        """
        self.grammar = grammar
        self.convert_node = convert_node
        self.convert_leaf = convert_leaf
        self.stack = Stack([StackNode(grammar.nonterminal_to_dfas[start_nonterminal][0])])
        self.error_recovery = error_recovery
    def parse(self, tokens):
        for type_, value, start_pos, prefix in tokens:
            self.add_token(type_, value, start_pos, prefix)

        while self.stack and self.stack[-1].dfa.is_final:
            self._pop()

        if self.stack:
            # We never broke out -- EOF is too soon -- Unfinished statement.
            # However, the error recovery might have added the token again, if
            # the stack is empty, we're fine.
            raise InternalParseError("incomplete input", type_, value, start_pos)
        return self.rootnode
    def add_token(self, type_, value, start_pos, prefix):
        """Add a token; return True if this is the end of the program."""
        transition = _token_to_transition(self.grammar, type_, value)
        stack = self.stack
        grammar = self.grammar

        while True:
            try:
                plan = stack[-1].dfa.transition_to_plan[transition]
                break
            except KeyError:
                if stack[-1].dfa.is_final:
                    self._pop()
                else:
                    self.error_recovery(grammar, stack, type_,
                                        value, start_pos, prefix, self.add_token)
                    return
            except IndexError:
                raise InternalParseError("too much input", type_, value, start_pos)

        stack[-1].dfa = plan.next_dfa

        for push in plan.dfa_pushes:
            stack.append(StackNode(push))

        leaf = self.convert_leaf(grammar, type_, value, prefix, start_pos)
        stack[-1].nodes.append(leaf)
    def _pop(self):
        tos = self.stack.pop()
        # If there's exactly one child, return that child instead of
        # creating a new node. We still create expr_stmt and
        # file_input though, because a lot of Jedi depends on its
        # logic.
        if len(tos.nodes) == 1:
            new_node = tos.nodes[0]
        else:
            new_node = self.convert_node(self.grammar, tos.dfa.from_rule, tos.nodes)

        try:
            self.stack[-1].nodes.append(new_node)
        except IndexError:
            # Stack is empty, set the rootnode.
            self.rootnode = new_node

parso/python/diff.py

@@ -287,7 +287,7 @@ class DiffParser(object):
        omitted_first_indent = False
        indents = []
        tokens = self._tokenizer(lines, (1, 0))
        stack = self._active_parser.pgen_parser.stack
        stack = self._active_parser.stack
        for typ, string, start_pos, prefix in tokens:
            start_pos = start_pos[0] + line_offset, start_pos[1]
            if typ == PythonTokenTypes.INDENT:
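Downstream, DiffParser no longer reaches through a wrapped pgen_parser object: the Stack hangs off the parser itself. A hedged sketch of the access pattern (that self._active_parser is a BaseParser subclass is taken from the hunk above; the rest is an assumption):

# Before: stack = self._active_parser.pgen_parser.stack
# After: BaseParser owns its Stack directly, so incremental reparsing can
# inspect it in place, e.g. the nodes on top via stack.get_tos_nodes().
stack = self._active_parser.stack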