Merge branch 'pgen'

This commit is contained in:
Dave Halter
2018-06-29 18:14:03 +02:00
20 changed files with 774 additions and 1009 deletions

View File

@@ -3,6 +3,11 @@
Changelog
---------
0.3.0 (2018-07-30)
+++++++++++++++++++
- Rewrote the pgen2 parser generator.
0.2.1 (2018-05-21)
+++++++++++++++++++

View File

@@ -43,7 +43,7 @@ from parso.grammar import Grammar, load_grammar
from parso.utils import split_lines, python_bytes_to_unicode
__version__ = '0.2.1'
__version__ = '0.3.0'
def parse(code=None, **kwargs):

View File

@@ -2,11 +2,11 @@ import hashlib
import os
from parso._compatibility import FileNotFoundError, is_pypy
from parso.pgen2.pgen import generate_grammar
from parso.pgen2 import generate_grammar
from parso.utils import split_lines, python_bytes_to_unicode, parse_version_string
from parso.python.diff import DiffParser
from parso.python.tokenize import tokenize_lines, tokenize
from parso.python import token
from parso.python.token import PythonTokenTypes
from parso.cache import parser_cache, load_module, save_module
from parso.parser import BaseParser
from parso.python.parser import Parser as PythonParser
@@ -51,8 +51,8 @@ class Grammar(object):
it is invalid, it will be returned as an error node. If disabled,
you will get a ParserSyntaxError when encountering syntax errors in your
code.
:param str start_symbol: The grammar symbol that you want to parse. Only
allowed to be used when error_recovery is False.
:param str start_symbol: The grammar rule (nonterminal) that you want
to parse. Only allowed to be used when error_recovery is False.
:param str path: The path to the file you want to open. Only needed for caching.
:param bool cache: Keeps a copy of the parser tree in RAM and on disk
if a path is given. Returns the cached trees if the corresponding
@@ -88,7 +88,7 @@ class Grammar(object):
raise TypeError("Please provide either code or a path.")
if start_symbol is None:
start_symbol = self._start_symbol
start_symbol = self._start_nonterminal
if error_recovery and start_symbol != 'file_input':
raise NotImplementedError("This is currently not implemented.")
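A minimal sketch of how these two parameters interact at the public API level (the version string and inputs below are illustrative, not part of this diff):

    import parso

    # Default: error recovery enabled, start symbol 'file_input'.
    module = parso.parse("x = 1\n")

    # Any other start nonterminal requires error_recovery=False.
    grammar = parso.load_grammar(version='3.6')
    expr = grammar.parse("1 + 2\n", start_symbol='eval_input',
                         error_recovery=False)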
@@ -136,7 +136,7 @@ class Grammar(object):
p = self._parser(
self._pgen_grammar,
error_recovery=error_recovery,
start_symbol=start_symbol
start_nonterminal=start_symbol
)
root_node = p.parse(tokens=tokens)
@@ -186,15 +186,15 @@ class Grammar(object):
return normalizer.issues
def __repr__(self):
labels = self._pgen_grammar.number2symbol.values()
txt = ' '.join(list(labels)[:3]) + ' ...'
nonterminals = self._pgen_grammar.nonterminal_to_dfas.keys()
txt = ' '.join(list(nonterminals)[:3]) + ' ...'
return '<%s:%s>' % (self.__class__.__name__, txt)
class PythonGrammar(Grammar):
_error_normalizer_config = ErrorFinderConfig()
_token_namespace = token
_start_symbol = 'file_input'
_token_namespace = PythonTokenTypes
_start_nonterminal = 'file_input'
def __init__(self, version_info, bnf_text):
super(PythonGrammar, self).__init__(

View File

@@ -1,3 +1,11 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
# 99% of the code is now different from the original pgen2.
"""
The ``Parser`` tries to convert the available Python code into an easy to read
format, something like an abstract syntax tree. The classes that represent this
@@ -16,7 +24,7 @@ complexity of the ``Parser`` (there's another parser sitting inside
``Statement``, which produces ``Array`` and ``Call``).
"""
from parso import tree
from parso.pgen2.parse import PgenParser
from parso.pgen2.generator import ReservedString
class ParserSyntaxError(Exception):
@@ -30,7 +38,76 @@ class ParserSyntaxError(Exception):
self.error_leaf = error_leaf
class InternalParseError(Exception):
"""
Exception to signal the parser is stuck and error recovery didn't help.
Basically this shouldn't happen. It's a sign that something is really
wrong.
"""
def __init__(self, msg, type_, value, start_pos):
Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
(msg, type_.name, value, start_pos))
self.msg = msg
self.type = type_
self.value = value
self.start_pos = start_pos
class Stack(list):
def _allowed_transition_names_and_token_types(self):
def iterate():
# An API just for Jedi.
for stack_node in reversed(self):
for transition in stack_node.dfa.transitions:
if isinstance(transition, ReservedString):
yield transition.value
else:
yield transition # A token type
if not stack_node.dfa.is_final:
break
return list(iterate())
class StackNode(object):
def __init__(self, dfa):
self.dfa = dfa
self.nodes = []
@property
def nonterminal(self):
return self.dfa.from_rule
def __repr__(self):
return '%s(%s, %s)' % (self.__class__.__name__, self.dfa, self.nodes)
def _token_to_transition(grammar, type_, value):
# Map from token to label
if type_.contains_syntax:
# Check for reserved words (keywords)
try:
return grammar.reserved_syntax_strings[value]
except KeyError:
pass
return type_
class BaseParser(object):
"""Parser engine.
A Parser instance contains state pertaining to the current token
sequence, and should not be used concurrently by different threads
to parse separate token sequences.
See python/tokenize.py for how to get input tokens from a string.
When a syntax error occurs, error_recovery() is called.
"""
node_map = {}
default_node = tree.Node
@@ -38,41 +115,94 @@ class BaseParser(object):
}
default_leaf = tree.Leaf
def __init__(self, pgen_grammar, start_symbol='file_input', error_recovery=False):
def __init__(self, pgen_grammar, start_nonterminal='file_input', error_recovery=False):
self._pgen_grammar = pgen_grammar
self._start_symbol = start_symbol
self._start_nonterminal = start_nonterminal
self._error_recovery = error_recovery
def parse(self, tokens):
start_number = self._pgen_grammar.symbol2number[self._start_symbol]
self.pgen_parser = PgenParser(
self._pgen_grammar, self.convert_node, self.convert_leaf,
self.error_recovery, start_number
)
first_dfa = self._pgen_grammar.nonterminal_to_dfas[self._start_nonterminal][0]
self.stack = Stack([StackNode(first_dfa)])
node = self.pgen_parser.parse(tokens)
# The stack is empty now, we don't need it anymore.
del self.pgen_parser
return node
for token in tokens:
self._add_token(token)
def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
add_token_callback):
while True:
tos = self.stack[-1]
if not tos.dfa.is_final:
# We never broke out -- EOF is too soon -- unfinished statement.
# However, the error recovery might have added the token again; if
# the stack is empty, we're fine.
raise InternalParseError(
"incomplete input", token.type, token.value, token.start_pos
)
if len(self.stack) > 1:
self._pop()
else:
return self.convert_node(tos.nonterminal, tos.nodes)
def error_recovery(self, token):
if self._error_recovery:
raise NotImplementedError("Error Recovery is not implemented")
else:
error_leaf = tree.ErrorLeaf('TODO %s' % typ, value, start_pos, prefix)
type_, value, start_pos, prefix = token
error_leaf = tree.ErrorLeaf('TODO %s' % type_, value, start_pos, prefix)
raise ParserSyntaxError('SyntaxError: invalid syntax', error_leaf)
def convert_node(self, pgen_grammar, type_, children):
# TODO REMOVE symbol, we don't want type here.
symbol = pgen_grammar.number2symbol[type_]
def convert_node(self, nonterminal, children):
try:
return self.node_map[symbol](children)
return self.node_map[nonterminal](children)
except KeyError:
return self.default_node(symbol, children)
return self.default_node(nonterminal, children)
def convert_leaf(self, pgen_grammar, type_, value, prefix, start_pos):
def convert_leaf(self, type_, value, prefix, start_pos):
try:
return self.leaf_map[type_](value, start_pos, prefix)
except KeyError:
return self.default_leaf(value, start_pos, prefix)
def _add_token(self, token):
"""
This is the only core function for parsing; basically everything happens
here. Everything is well prepared by the parser generator, so we only
apply the necessary steps here.
"""
grammar = self._pgen_grammar
stack = self.stack
type_, value, start_pos, prefix = token
transition = _token_to_transition(grammar, type_, value)
while True:
try:
plan = stack[-1].dfa.transitions[transition]
break
except KeyError:
if stack[-1].dfa.is_final:
self._pop()
else:
self.error_recovery(token)
return
except IndexError:
raise InternalParseError("too much input", type_, value, start_pos)
stack[-1].dfa = plan.next_dfa
for push in plan.dfa_pushes:
stack.append(StackNode(push))
leaf = self.convert_leaf(type_, value, prefix, start_pos)
stack[-1].nodes.append(leaf)
def _pop(self):
tos = self.stack.pop()
# If there's exactly one child, return that child instead of
# creating a new node. We still create expr_stmt and
# file_input though, because a lot of Jedi depends on their
# logic.
if len(tos.nodes) == 1:
new_node = tos.nodes[0]
else:
new_node = self.convert_node(tos.dfa.from_rule, tos.nodes)
self.stack[-1].nodes.append(new_node)
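As a rough sketch of how the new plan-based machinery fits together, one can feed a hand-written grammar and hand-made tokens straight into BaseParser (the one-rule grammar below is hypothetical and not part of this commit):

    from parso.parser import BaseParser
    from parso.pgen2 import generate_grammar
    from parso.python.token import PythonTokenTypes
    from parso.python.tokenize import PythonToken

    # NAME/NEWLINE/ENDMARKER refer to members of the token namespace.
    grammar = generate_grammar("file_input: NAME NEWLINE ENDMARKER\n",
                               token_namespace=PythonTokenTypes)
    tokens = [
        PythonToken(PythonTokenTypes.NAME, 'hello', (1, 0), ''),
        PythonToken(PythonTokenTypes.NEWLINE, '\n', (1, 5), ''),
        PythonToken(PythonTokenTypes.ENDMARKER, '', (2, 0), ''),
    ]
    parser = BaseParser(grammar, start_nonterminal='file_input')
    tree = parser.parse(tokens)  # a tree.Node with nonterminal 'file_input'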

View File

@@ -6,3 +6,5 @@
# Licensed to PSF under a Contributor Agreement.
# Copyright 2014 David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
from parso.pgen2.generator import generate_grammar

359
parso/pgen2/generator.py Normal file
View File

@@ -0,0 +1,359 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
"""
This module defines the data structures used to represent a grammar.
Specifying grammars in pgen is possible with this grammar::
grammar: (NEWLINE | rule)* ENDMARKER
rule: NAME ':' rhs NEWLINE
rhs: items ('|' items)*
items: item+
item: '[' rhs ']' | atom ['+' | '*']
atom: '(' rhs ')' | NAME | STRING
This grammar is self-referencing.
This parser generator (pgen2) was created by Guido van Rossum and used for lib2to3.
Most of the code has been refactored to make it more Pythonic. Since this was a
"copy" of the CPython parser generator "pgen", some work was needed to make
it more readable. It should also be slightly faster than the original pgen2,
because we made some optimizations.
"""
from ast import literal_eval
from parso.pgen2.grammar_parser import GrammarParser, NFAState
class Grammar(object):
"""
Once initialized, this class supplies the grammar tables for the
parsing engine implemented by parso.parser. The parsing engine
accesses the instance variables directly.
The only important parts of this parser are the DFAs and the transitions
between them.
"""
def __init__(self, start_nonterminal, rule_to_dfas, reserved_syntax_strings):
self.nonterminal_to_dfas = rule_to_dfas # Dict[str, List[DFAState]]
self.reserved_syntax_strings = reserved_syntax_strings
self.start_nonterminal = start_nonterminal
class DFAPlan(object):
"""
Plans are used for the parser to create stack nodes and do the proper
DFA state transitions.
"""
def __init__(self, next_dfa, dfa_pushes=[]):
self.next_dfa = next_dfa
self.dfa_pushes = dfa_pushes
def __repr__(self):
return '%s(%s, %s)' % (self.__class__.__name__, self.next_dfa, self.dfa_pushes)
class DFAState(object):
"""
The DFAState object is the core class for pretty much everything. DFAStates
are the vertices of an ordered graph, while arcs and transitions are the
edges.
Arcs are the initial edges, where most DFAStates are not connected, and
transitions are then calculated to connect the DFA state machines that have
different nonterminals.
"""
def __init__(self, from_rule, nfa_set, final):
assert isinstance(nfa_set, set)
assert isinstance(next(iter(nfa_set)), NFAState)
assert isinstance(final, NFAState)
self.from_rule = from_rule
self.nfa_set = nfa_set
self.arcs = {} # map from terminals/nonterminals to DFAState
# In an intermediary step we set these nonterminal arcs (which have the
# same structure as arcs). These don't contain terminals anymore.
self.nonterminal_arcs = {}
# Transitions and is_final are basically the only things that the
# parser uses. Everything else is purely here to create a parser.
self.transitions = {} #: Dict[Union[TokenType, ReservedString], DFAPlan]
self.is_final = final in nfa_set
def add_arc(self, next_, label):
assert isinstance(label, str)
assert label not in self.arcs
assert isinstance(next_, DFAState)
self.arcs[label] = next_
def unifystate(self, old, new):
for label, next_ in self.arcs.items():
if next_ is old:
self.arcs[label] = new
def __eq__(self, other):
# Equality test -- ignore the nfa_set instance variable
assert isinstance(other, DFAState)
if self.is_final != other.is_final:
return False
# Can't just return self.arcs == other.arcs, because that
# would invoke this method recursively, with cycles...
if len(self.arcs) != len(other.arcs):
return False
for label, next_ in self.arcs.items():
if next_ is not other.arcs.get(label):
return False
return True
__hash__ = None # For Py3 compatibility.
def __repr__(self):
return '<%s: %s is_final=%s>' % (
self.__class__.__name__, self.from_rule, self.is_final
)
class ReservedString(object):
"""
Most grammars will have certain keywords and operators that are mentioned
in the grammar as strings (e.g. "if") and not token types (e.g. NUMBER).
This class basically is the former.
"""
def __init__(self, value):
self.value = value
def __repr__(self):
return '%s(%s)' % (self.__class__.__name__, self.value)
def _simplify_dfas(dfas):
"""
This is not theoretically optimal, but works well enough.
Algorithm: repeatedly look for two states that have the same
set of arcs (same labels pointing to the same nodes) and
unify them, until things stop changing.
dfas is a list of DFAState instances
"""
changes = True
while changes:
changes = False
for i, state_i in enumerate(dfas):
for j in range(i + 1, len(dfas)):
state_j = dfas[j]
if state_i == state_j:
#print " unify", i, j
del dfas[j]
for state in dfas:
state.unifystate(state_j, state_i)
changes = True
break
def _make_dfas(start, finish):
"""
Uses the powerset construction algorithm to create DFA states from sets of
NFA states.
Also does state reduction if some states are not needed.
"""
# To turn an NFA into a DFA, we define the states of the DFA
# to correspond to *sets* of states of the NFA. Then do some
# state reduction.
assert isinstance(start, NFAState)
assert isinstance(finish, NFAState)
def addclosure(nfa_state, base_nfa_set):
assert isinstance(nfa_state, NFAState)
if nfa_state in base_nfa_set:
return
base_nfa_set.add(nfa_state)
for nfa_arc in nfa_state.arcs:
if nfa_arc.nonterminal_or_string is None:
addclosure(nfa_arc.next, base_nfa_set)
base_nfa_set = set()
addclosure(start, base_nfa_set)
states = [DFAState(start.from_rule, base_nfa_set, finish)]
for state in states: # NB states grows while we're iterating
arcs = {}
# Find state transitions and store them in arcs.
for nfa_state in state.nfa_set:
for nfa_arc in nfa_state.arcs:
if nfa_arc.nonterminal_or_string is not None:
nfa_set = arcs.setdefault(nfa_arc.nonterminal_or_string, set())
addclosure(nfa_arc.next, nfa_set)
# Now create the dfa's with no None's in arcs anymore. All Nones have
# been eliminated and state transitions (arcs) are properly defined, we
# just need to create the dfa's.
for nonterminal_or_string, nfa_set in arcs.items():
for nested_state in states:
if nested_state.nfa_set == nfa_set:
# The DFA state already exists for this rule.
break
else:
nested_state = DFAState(start.from_rule, nfa_set, finish)
states.append(nested_state)
state.add_arc(nested_state, nonterminal_or_string)
return states # List of DFAState instances; first one is start
def _dump_nfa(start, finish):
print("Dump of NFA for", start.from_rule)
todo = [start]
for i, state in enumerate(todo):
print(" State", i, state is finish and "(final)" or "")
for label, next_ in state.arcs:
if next_ in todo:
j = todo.index(next_)
else:
j = len(todo)
todo.append(next_)
if label is None:
print(" -> %d" % j)
else:
print(" %s -> %d" % (label, j))
def _dump_dfas(dfas):
print("Dump of DFA for", dfas[0].from_rule)
for i, state in enumerate(dfas):
print(" State", i, state.is_final and "(final)" or "")
for nonterminal, next_ in state.arcs.items():
print(" %s -> %d" % (nonterminal, dfas.index(next_)))
def generate_grammar(bnf_grammar, token_namespace):
"""
``bnf_grammar`` is a grammar in extended BNF (using * for repetition, + for
at-least-once repetition, [] for optional parts, | for alternatives and ()
for grouping).
It's not EBNF according to ISO/IEC 14977. It's a dialect Python uses in its
own parser.
"""
rule_to_dfas = {}
start_nonterminal = None
for nfa_a, nfa_z in GrammarParser(bnf_grammar).parse():
#_dump_nfa(a, z)
dfas = _make_dfas(nfa_a, nfa_z)
#_dump_dfas(dfas)
# oldlen = len(dfas)
_simplify_dfas(dfas)
# newlen = len(dfas)
rule_to_dfas[nfa_a.from_rule] = dfas
#print(nfa_a.from_rule, oldlen, newlen)
if start_nonterminal is None:
start_nonterminal = nfa_a.from_rule
reserved_strings = {}
for nonterminal, dfas in rule_to_dfas.items():
for dfa_state in dfas:
for terminal_or_nonterminal, next_dfa in dfa_state.arcs.items():
if terminal_or_nonterminal in rule_to_dfas:
dfa_state.nonterminal_arcs[terminal_or_nonterminal] = next_dfa
else:
transition = _make_transition(
token_namespace,
reserved_strings,
terminal_or_nonterminal
)
dfa_state.transitions[transition] = DFAPlan(next_dfa)
_calculate_tree_traversal(rule_to_dfas)
return Grammar(start_nonterminal, rule_to_dfas, reserved_strings)
def _make_transition(token_namespace, reserved_syntax_strings, label):
"""
Creates a reserved string ("if", "for", "*", ...) or returns the token type
(NUMBER, STRING, ...) for a given grammar terminal.
"""
if label[0].isalpha():
# A named token (e.g. NAME, NUMBER, STRING)
return getattr(token_namespace, label)
else:
# Either a keyword or an operator
assert label[0] in ('"', "'"), label
assert not label.startswith('"""') and not label.startswith("'''")
value = literal_eval(label)
try:
return reserved_syntax_strings[value]
except KeyError:
r = reserved_syntax_strings[value] = ReservedString(value)
return r
def _calculate_tree_traversal(nonterminal_to_dfas):
"""
By this point we know how dfas can move around within a stack node, but we
don't know how we can add a new stack node (nonterminal transitions).
"""
# Map from grammar rule (nonterminal) name to a set of tokens.
first_plans = {}
nonterminals = list(nonterminal_to_dfas.keys())
nonterminals.sort()
for nonterminal in nonterminals:
if nonterminal not in first_plans:
_calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal)
# Now that we have calculated the first terminals, we are sure that
# there is no left recursion and no ambiguity.
for dfas in nonterminal_to_dfas.values():
for dfa_state in dfas:
for nonterminal, next_dfa in dfa_state.nonterminal_arcs.items():
for transition, pushes in first_plans[nonterminal].items():
dfa_state.transitions[transition] = DFAPlan(next_dfa, pushes)
def _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal):
"""
Calculates the first plans for the given nonterminal and stores them in
the first_plans dictionary. This is then used to know when to create stack nodes.
"""
dfas = nonterminal_to_dfas[nonterminal]
new_first_plans = {}
first_plans[nonterminal] = None # dummy to detect left recursion
# We only need to check the first dfa. All the following ones are not
# interesting for finding first terminals.
state = dfas[0]
for transition, next_ in state.transitions.items():
# It's a terminal (a token type or reserved string): a possible first token.
new_first_plans[transition] = [next_.next_dfa]
for nonterminal2, next_ in state.nonterminal_arcs.items():
# It's a nonterminal and we have either a left recursion issue
# in the grammar or we have to recurse.
try:
first_plans2 = first_plans[nonterminal2]
except KeyError:
first_plans2 = _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal2)
else:
if first_plans2 is None:
raise ValueError("left recursion for rule %r" % nonterminal)
for t, pushes in first_plans2.items():
check = new_first_plans.get(t)
if check is not None:
raise ValueError(
"Rule %s is ambiguous; %s is the"
" start of the rule %s as well as %s."
% (nonterminal, t, nonterminal2, check[-1].from_rule)
)
new_first_plans[t] = [next_] + pushes
first_plans[nonterminal] = new_first_plans
return new_first_plans
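A hedged sketch of what the generator produces for a small grammar (the two-rule grammar is illustrative; attribute names are the ones defined above):

    from parso.pgen2 import generate_grammar
    from parso.python.token import PythonTokenTypes

    bnf = (
        "file_input: stmt* ENDMARKER\n"
        "stmt: NAME '=' NAME NEWLINE\n"
    )
    grammar = generate_grammar(bnf, token_namespace=PythonTokenTypes)

    print(grammar.start_nonterminal)              # 'file_input'
    print(list(grammar.reserved_syntax_strings))  # ['=']
    for nonterminal, dfas in grammar.nonterminal_to_dfas.items():
        for dfa in dfas:
            # Each transition maps a token type or ReservedString to a DFAPlan.
            print(nonterminal, dfa.is_final, list(dfa.transitions))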

View File

@@ -1,97 +0,0 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
"""This module defines the data structures used to represent a grammar.
These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.
There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.
"""
class Grammar(object):
"""Pgen parsing tables conversion class.
Once initialized, this class supplies the grammar tables for the
parsing engine implemented by parse.py. The parsing engine
accesses the instance variables directly. The class here does not
provide initialization of the tables; several subclasses exist to
do this (see the conv and pgen modules).
The instance variables are as follows:
symbol2number -- a dict mapping symbol names to numbers. Symbol
numbers are always 256 or higher, to distinguish
them from token numbers, which are between 0 and
255 (inclusive).
number2symbol -- a dict mapping numbers to symbol names;
these two are each other's inverse.
states -- a list of DFAs, where each DFA is a list of
states, each state is a list of arcs, and each
arc is a (i, j) pair where i is a label and j is
a state number. The DFA number is the index into
this list. (This name is slightly confusing.)
Final states are represented by a special arc of
the form (0, j) where j is its own state number.
dfas -- a dict mapping symbol numbers to (DFA, first)
pairs, where DFA is an item from the states list
above, and first is a set of tokens that can
begin this grammar rule (represented by a dict
whose values are always 1).
labels -- a list of (x, y) pairs where x is either a token
number or a symbol number, and y is either None
or a string; the strings are keywords. The label
number is the index in this list; label numbers
are used to mark state transitions (arcs) in the
DFAs.
start -- the number of the grammar's start symbol.
keywords -- a dict mapping keyword strings to arc labels.
tokens -- a dict mapping token numbers to arc labels.
"""
def __init__(self, bnf_text, start_symbol):
self.symbol2number = {}
self.number2symbol = {}
self.states = []
self.dfas = {}
self.labels = [(0, "EMPTY")]
self.keywords = {}
self.tokens = {}
self.symbol2label = {}
self.label2symbol = {}
self.start_symbol = start_symbol
@property
def start(self):
return self.symbol2number[self.start_symbol]
def report(self):
"""Dump the grammar tables to standard output, for debugging."""
from pprint import pprint
print("s2n")
pprint(self.symbol2number)
print("n2s")
pprint(self.number2symbol)
print("states")
pprint(self.states)
print("dfas")
pprint(self.dfas)
print("labels")
pprint(self.labels)
print("start", self.start)

View File

@@ -5,9 +5,9 @@
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
from parso.python import tokenize
from parso.python.tokenize import tokenize
from parso.utils import parse_version_string
from parso.python import token
from parso.python.token import PythonTokenTypes
class GrammarParser():
@@ -16,7 +16,7 @@ class GrammarParser():
"""
def __init__(self, bnf_grammar):
self._bnf_grammar = bnf_grammar
self.generator = tokenize.tokenize(
self.generator = tokenize(
bnf_grammar,
version_info=parse_version_string('3.6')
)
@@ -24,16 +24,16 @@ class GrammarParser():
def parse(self):
# grammar: (NEWLINE | rule)* ENDMARKER
while self.type != token.ENDMARKER:
while self.type == token.NEWLINE:
while self.type != PythonTokenTypes.ENDMARKER:
while self.type == PythonTokenTypes.NEWLINE:
self._gettoken()
# rule: NAME ':' rhs NEWLINE
self._current_rule_name = self._expect(token.NAME)
self._expect(token.COLON)
self._current_rule_name = self._expect(PythonTokenTypes.NAME)
self._expect(PythonTokenTypes.OP, ':')
a, z = self._parse_rhs()
self._expect(token.NEWLINE)
self._expect(PythonTokenTypes.NEWLINE)
yield a, z
@@ -60,7 +60,8 @@ class GrammarParser():
def _parse_items(self):
# items: item+
a, b = self._parse_item()
while self.type in (token.NAME, token.STRING, token.LPAR, token.LSQB):
while self.type in (PythonTokenTypes.NAME, PythonTokenTypes.STRING) \
or self.value in ('(', '['):
c, d = self._parse_item()
# Need to end on the next item.
b.add_arc(c)
@@ -72,7 +73,7 @@ class GrammarParser():
if self.value == "[":
self._gettoken()
a, z = self._parse_rhs()
self._expect(token.RSQB)
self._expect(PythonTokenTypes.OP, ']')
# Also make it possible that there is no token and just change the
# state.
a.add_arc(z)
@@ -97,9 +98,9 @@ class GrammarParser():
if self.value == "(":
self._gettoken()
a, z = self._parse_rhs()
self._expect(token.RPAR)
self._expect(PythonTokenTypes.OP, ')')
return a, z
elif self.type in (token.NAME, token.STRING):
elif self.type in (PythonTokenTypes.NAME, PythonTokenTypes.STRING):
a = NFAState(self._current_rule_name)
z = NFAState(self._current_rule_name)
# Make it clear that the state transition requires that value.
@@ -110,10 +111,12 @@ class GrammarParser():
self._raise_error("expected (...) or NAME or STRING, got %s/%s",
self.type, self.value)
def _expect(self, type):
if self.type != type:
self._raise_error("expected %s(%s), got %s(%s)",
type, token.tok_name[type], self.type, self.value)
def _expect(self, type_, value=None):
if self.type != type_:
self._raise_error("expected %s, got %s [%s]",
type_, self.type, self.value)
if value is not None and self.value != value:
self._raise_error("expected %s, got %s", value, self.value)
value = self.value
self._gettoken()
return value
@@ -134,20 +137,20 @@ class GrammarParser():
class NFAArc(object):
def __init__(self, next_, label_or_string):
def __init__(self, next_, nonterminal_or_string):
self.next = next_
self.label_or_string = label_or_string
self.nonterminal_or_string = nonterminal_or_string
class NFAState(object):
def __init__(self, from_rule):
self.from_rule = from_rule
self.arcs = [] # list of (label, NFAState) pairs
self.arcs = [] # List[NFAArc]
def add_arc(self, next_, label=None):
assert label is None or isinstance(label, str)
def add_arc(self, next_, nonterminal_or_string=None):
assert nonterminal_or_string is None or isinstance(nonterminal_or_string, str)
assert isinstance(next_, NFAState)
self.arcs.append(NFAArc(next_, label))
self.arcs.append(NFAArc(next_, nonterminal_or_string))
def __repr__(self):
return '<%s: from %s>' % (self.__class__.__name__, self.from_rule)
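For illustration, this is roughly what generate_grammar does with the classes above (GrammarParser and NFAState are private to pgen2; the rule text is made up):

    from parso.pgen2.grammar_parser import GrammarParser

    for start, finish in GrammarParser("stmt: NAME NEWLINE\n").parse():
        print(start.from_rule)  # 'stmt'
        for arc in start.arcs:
            # Each NFAArc carries a nonterminal/string label (or None for
            # an epsilon transition) and the next NFAState.
            print(arc.nonterminal_or_string, arc.next)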

View File

@@ -1,238 +0,0 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
"""
Parser engine for the grammar tables generated by pgen.
The grammar table must be loaded first.
See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.
"""
from parso.python import tokenize
class InternalParseError(Exception):
"""
Exception to signal the parser is stuck and error recovery didn't help.
Basically this shouldn't happen. It's a sign that something is really
wrong.
"""
def __init__(self, msg, type, value, start_pos):
Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
(msg, tokenize.tok_name[type], value, start_pos))
self.msg = msg
self.type = type
self.value = value
self.start_pos = start_pos
class Stack(list):
def get_tos_nodes(self):
tos = self[-1]
return tos[2][1]
def get_tos_first_tokens(self, grammar):
tos = self[-1]
inv_tokens = dict((v, k) for k, v in grammar.tokens.items())
inv_keywords = dict((v, k) for k, v in grammar.keywords.items())
dfa, state, nodes = tos
def check():
for first in dfa[1]:
try:
yield inv_keywords[first]
except KeyError:
yield tokenize.tok_name[inv_tokens[first]]
return sorted(check())
def token_to_ilabel(grammar, type_, value):
# Map from token to label
if type_ == tokenize.NAME:
# Check for reserved words (keywords)
try:
return grammar.keywords[value]
except KeyError:
pass
try:
return grammar.tokens[type_]
except KeyError:
return None
class PgenParser(object):
"""Parser engine.
The proper usage sequence is:
p = Parser(grammar, [converter]) # create instance
p.setup([start]) # prepare for parsing
<for each input token>:
if p.add_token(...): # parse a token
break
root = p.rootnode # root of abstract syntax tree
A Parser instance may be reused by calling setup() repeatedly.
A Parser instance contains state pertaining to the current token
sequence, and should not be used concurrently by different threads
to parse separate token sequences.
See driver.py for how to get input tokens by tokenizing a file or
string.
Parsing is complete when add_token() returns True; the root of the
abstract syntax tree can then be retrieved from the rootnode
instance variable. When a syntax error occurs, error_recovery()
is called. There is no error recovery; the parser cannot be used
after a syntax error was reported (but it can be reinitialized by
calling setup()).
"""
def __init__(self, grammar, convert_node, convert_leaf, error_recovery, start):
"""Constructor.
The grammar argument is a grammar.Grammar instance; see the
grammar module for more information.
The parser is not ready yet for parsing; you must call the
setup() method to get it started.
The optional convert argument is a function mapping concrete
syntax tree nodes to abstract syntax tree nodes. If not
given, no conversion is done and the syntax tree produced is
the concrete syntax tree. If given, it must be a function of
two arguments, the first being the grammar (a grammar.Grammar
instance), and the second being the concrete syntax tree node
to be converted. The syntax tree is converted from the bottom
up.
A concrete syntax tree node is a (type, nodes) tuple, where
type is the node type (a token or symbol number) and nodes
is a list of children for symbols, and None for tokens.
An abstract syntax tree node may be anything; this is entirely
up to the converter function.
"""
self.grammar = grammar
self.convert_node = convert_node
self.convert_leaf = convert_leaf
# Each stack entry is a tuple: (dfa, state, node).
# A node is a tuple: (type, children),
# where children is a list of nodes or None
newnode = (start, [])
stackentry = (self.grammar.dfas[start], 0, newnode)
self.stack = Stack([stackentry])
self.rootnode = None
self.error_recovery = error_recovery
def parse(self, tokens):
for type_, value, start_pos, prefix in tokens:
if self.add_token(type_, value, start_pos, prefix):
break
else:
# We never broke out -- EOF is too soon -- Unfinished statement.
# However, the error recovery might have added the token again, if
# the stack is empty, we're fine.
if self.stack:
raise InternalParseError("incomplete input", type_, value, start_pos)
return self.rootnode
def add_token(self, type_, value, start_pos, prefix):
"""Add a token; return True if this is the end of the program."""
ilabel = token_to_ilabel(self.grammar, type_, value)
# Loop until the token is shifted; may raise exceptions
_gram = self.grammar
_labels = _gram.labels
_push = self._push
_pop = self._pop
_shift = self._shift
while True:
dfa, state, node = self.stack[-1]
states, first = dfa
arcs = states[state]
# Look for a state with this label
for i, newstate in arcs:
t, v = _labels[i]
if ilabel == i:
# Look it up in the list of labels
assert t < 256
# Shift a token; we're done with it
_shift(type_, value, newstate, prefix, start_pos)
# Pop while we are in an accept-only state
state = newstate
while states[state] == [(0, state)]:
_pop()
if not self.stack:
# Done parsing!
return True
dfa, state, node = self.stack[-1]
states, first = dfa
# Done with this token
return False
elif t >= 256:
# See if it's a symbol and if we're in its first set
itsdfa = _gram.dfas[t]
itsstates, itsfirst = itsdfa
if ilabel in itsfirst:
# Push a symbol
_push(t, itsdfa, newstate)
break # To continue the outer while loop
else:
if (0, state) in arcs:
# An accepting state, pop it and try something else
_pop()
if not self.stack:
# Done parsing, but another token is input
raise InternalParseError("too much input", type_, value, start_pos)
else:
self.error_recovery(self.grammar, self.stack, arcs, type_,
value, start_pos, prefix, self.add_token)
break
def _shift(self, type_, value, newstate, prefix, start_pos):
"""Shift a token. (Internal)"""
dfa, state, node = self.stack[-1]
newnode = self.convert_leaf(self.grammar, type_, value, prefix, start_pos)
node[-1].append(newnode)
self.stack[-1] = (dfa, newstate, node)
def _push(self, type_, newdfa, newstate):
"""Push a nonterminal. (Internal)"""
dfa, state, node = self.stack[-1]
newnode = (type_, [])
self.stack[-1] = (dfa, newstate, node)
self.stack.append((newdfa, 0, newnode))
def _pop(self):
"""Pop a nonterminal. (Internal)"""
popdfa, popstate, (type_, children) = self.stack.pop()
# If there's exactly one child, return that child instead of creating a
# new node. We still create expr_stmt and file_input though, because a
# lot of Jedi depends on its logic.
if len(children) == 1:
newnode = children[0]
else:
newnode = self.convert_node(self.grammar, type_, children)
try:
# Equal to:
# dfa, state, node = self.stack[-1]
# symbol, children = node
self.stack[-1][2][1].append(newnode)
except IndexError:
# Stack is empty, set the rootnode.
self.rootnode = newnode

View File

@@ -1,303 +0,0 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
"""
Specifying grammars in pgen is possible with this grammar::
grammar: (NEWLINE | rule)* ENDMARKER
rule: NAME ':' rhs NEWLINE
rhs: items ('|' items)*
items: item+
item: '[' rhs ']' | atom ['+' | '*']
atom: '(' rhs ')' | NAME | STRING
This grammar is self-referencing.
"""
from parso.pgen2.grammar import Grammar
from parso.python import token
from parso.pgen2.grammar_parser import GrammarParser, NFAState
class ParserGenerator(object):
def __init__(self, rule_to_dfas, token_namespace):
self._token_namespace = token_namespace
self._rule_to_dfas = rule_to_dfas
def make_grammar(self, grammar):
self._first = {} # map from symbol name to set of tokens
names = list(self._rule_to_dfas.keys())
names.sort()
for name in names:
if name not in self._first:
self._calcfirst(name)
#print name, self._first[name].keys()
i = 256 + len(grammar.symbol2number)
grammar.symbol2number[name] = i
grammar.number2symbol[i] = name
for name in names:
dfas = self._rule_to_dfas[name]
states = []
for state in dfas:
arcs = []
for label, next in state.arcs.items():
arcs.append((self._make_label(grammar, label), dfas.index(next)))
if state.isfinal:
arcs.append((0, dfas.index(state)))
states.append(arcs)
grammar.states.append(states)
grammar.dfas[grammar.symbol2number[name]] = (states, self._make_first(grammar, name))
return grammar
def _make_first(self, grammar, name):
rawfirst = self._first[name]
first = {}
for label in rawfirst:
ilabel = self._make_label(grammar, label)
##assert ilabel not in first # XXX failed on <> ... !=
first[ilabel] = 1
return first
def _make_label(self, grammar, label):
# XXX Maybe this should be a method on a subclass of converter?
ilabel = len(grammar.labels)
if label[0].isalpha():
# Either a symbol name or a named token
if label in grammar.symbol2number:
# A symbol name (a non-terminal)
if label in grammar.symbol2label:
return grammar.symbol2label[label]
else:
grammar.labels.append((grammar.symbol2number[label], None))
grammar.symbol2label[label] = ilabel
grammar.label2symbol[ilabel] = label
return ilabel
else:
# A named token (NAME, NUMBER, STRING)
itoken = getattr(self._token_namespace, label, None)
assert isinstance(itoken, int), label
if itoken in grammar.tokens:
return grammar.tokens[itoken]
else:
grammar.labels.append((itoken, None))
grammar.tokens[itoken] = ilabel
return ilabel
else:
# Either a keyword or an operator
assert label[0] in ('"', "'"), label
value = eval(label)
if value[0].isalpha():
# A keyword
if value in grammar.keywords:
return grammar.keywords[value]
else:
# TODO this might be an issue?! Using token.NAME here?
grammar.labels.append((token.NAME, value))
grammar.keywords[value] = ilabel
return ilabel
else:
# An operator (any non-numeric token)
itoken = self._token_namespace.generate_token_id(value)
if itoken in grammar.tokens:
return grammar.tokens[itoken]
else:
grammar.labels.append((itoken, None))
grammar.tokens[itoken] = ilabel
return ilabel
def _calcfirst(self, name):
dfa = self._rule_to_dfas[name]
self._first[name] = None # dummy to detect left recursion
state = dfa[0]
totalset = {}
overlapcheck = {}
for label, next in state.arcs.items():
if label in self._rule_to_dfas:
if label in self._first:
fset = self._first[label]
if fset is None:
raise ValueError("left recursion for rule %r" % name)
else:
self._calcfirst(label)
fset = self._first[label]
totalset.update(fset)
overlapcheck[label] = fset
else:
totalset[label] = 1
overlapcheck[label] = {label: 1}
inverse = {}
for label, itsfirst in overlapcheck.items():
for symbol in itsfirst:
if symbol in inverse:
raise ValueError("rule %s is ambiguous; %s is in the"
" first sets of %s as well as %s" %
(name, symbol, label, inverse[symbol]))
inverse[symbol] = label
self._first[name] = totalset
class DFAState(object):
def __init__(self, from_rule, nfa_set, final):
assert isinstance(nfa_set, set)
assert isinstance(next(iter(nfa_set)), NFAState)
assert isinstance(final, NFAState)
self.from_rule = from_rule
self.nfa_set = nfa_set
self.isfinal = final in nfa_set
self.arcs = {} # map from label to DFAState
def add_arc(self, next_, label):
assert isinstance(label, str)
assert label not in self.arcs
assert isinstance(next_, DFAState)
self.arcs[label] = next_
def unifystate(self, old, new):
for label, next in self.arcs.items():
if next is old:
self.arcs[label] = new
def __eq__(self, other):
# Equality test -- ignore the nfa_set instance variable
assert isinstance(other, DFAState)
if self.isfinal != other.isfinal:
return False
# Can't just return self.arcs == other.arcs, because that
# would invoke this method recursively, with cycles...
if len(self.arcs) != len(other.arcs):
return False
for label, next in self.arcs.items():
if next is not other.arcs.get(label):
return False
return True
__hash__ = None # For Py3 compatibility.
def _simplify_dfas(dfas):
# This is not theoretically optimal, but works well enough.
# Algorithm: repeatedly look for two states that have the same
# set of arcs (same labels pointing to the same nodes) and
# unify them, until things stop changing.
# dfas is a list of DFAState instances
changes = True
while changes:
changes = False
for i, state_i in enumerate(dfas):
for j in range(i + 1, len(dfas)):
state_j = dfas[j]
if state_i == state_j:
#print " unify", i, j
del dfas[j]
for state in dfas:
state.unifystate(state_j, state_i)
changes = True
break
def _make_dfas(start, finish):
"""
This is basically doing what the powerset construction algorithm is doing.
"""
# To turn an NFA into a DFA, we define the states of the DFA
# to correspond to *sets* of states of the NFA. Then do some
# state reduction.
assert isinstance(start, NFAState)
assert isinstance(finish, NFAState)
def addclosure(nfa_state, base_nfa_set):
assert isinstance(nfa_state, NFAState)
if nfa_state in base_nfa_set:
return
base_nfa_set.add(nfa_state)
for nfa_arc in nfa_state.arcs:
if nfa_arc.label_or_string is None:
addclosure(nfa_arc.next, base_nfa_set)
base_nfa_set = set()
addclosure(start, base_nfa_set)
states = [DFAState(start.from_rule, base_nfa_set, finish)]
for state in states: # NB states grows while we're iterating
arcs = {}
# Find state transitions and store them in arcs.
for nfa_state in state.nfa_set:
for nfa_arc in nfa_state.arcs:
if nfa_arc.label_or_string is not None:
nfa_set = arcs.setdefault(nfa_arc.label_or_string, set())
addclosure(nfa_arc.next, nfa_set)
# Now create the dfa's with no None's in arcs anymore. All Nones have
# been eliminated and state transitions (arcs) are properly defined, we
# just need to create the dfa's.
for label_or_string, nfa_set in arcs.items():
for nested_state in states:
if nested_state.nfa_set == nfa_set:
# The DFA state already exists for this rule.
break
else:
nested_state = DFAState(start.from_rule, nfa_set, finish)
states.append(nested_state)
state.add_arc(nested_state, label_or_string)
return states # List of DFAState instances; first one is start
def _dump_nfa(start, finish):
print("Dump of NFA for", start.from_rule)
todo = [start]
for i, state in enumerate(todo):
print(" State", i, state is finish and "(final)" or "")
for label, next in state.arcs:
if next in todo:
j = todo.index(next)
else:
j = len(todo)
todo.append(next)
if label is None:
print(" -> %d" % j)
else:
print(" %s -> %d" % (label, j))
def _dump_dfas(dfas):
print("Dump of DFA for", dfas[0].from_rule)
for i, state in enumerate(dfas):
print(" State", i, state.isfinal and "(final)" or "")
for label, next in state.arcs.items():
print(" %s -> %d" % (label, dfas.index(next)))
def generate_grammar(bnf_grammar, token_namespace):
"""
``bnf_text`` is a grammar in extended BNF (using * for repetition, + for
at-least-once repetition, [] for optional parts, | for alternatives and ()
for grouping).
It's not EBNF according to ISO/IEC 14977. It's a dialect Python uses in its
own parser.
"""
rule_to_dfas = {}
start_symbol = None
for nfa_a, nfa_z in GrammarParser(bnf_grammar).parse():
#_dump_nfa(a, z)
dfas = _make_dfas(nfa_a, nfa_z)
#_dump_dfas(dfas)
# oldlen = len(dfas)
_simplify_dfas(dfas)
# newlen = len(dfas)
rule_to_dfas[nfa_a.from_rule] = dfas
#print(nfa_a.from_rule, oldlen, newlen)
if start_symbol is None:
start_symbol = nfa_a.from_rule
p = ParserGenerator(rule_to_dfas, token_namespace)
return p.make_grammar(Grammar(bnf_grammar, start_symbol))

View File

@@ -13,8 +13,8 @@ import logging
from parso.utils import split_lines
from parso.python.parser import Parser
from parso.python.tree import EndMarker
from parso.python.tokenize import (NEWLINE, PythonToken, ERROR_DEDENT,
ENDMARKER, INDENT, DEDENT)
from parso.python.tokenize import PythonToken
from parso.python.token import PythonTokenTypes
LOG = logging.getLogger(__name__)
@@ -29,7 +29,7 @@ def _get_last_line(node_or_leaf):
def _ends_with_newline(leaf, suffix=''):
if leaf.type == 'error_leaf':
typ = leaf.original_type
typ = leaf.token_type.lower()
else:
typ = leaf.type
@@ -41,9 +41,8 @@ def _flows_finished(pgen_grammar, stack):
if, while, for and try might not be finished, because another part might
still be parsed.
"""
for dfa, newstate, (symbol_number, nodes) in stack:
if pgen_grammar.number2symbol[symbol_number] in ('if_stmt', 'while_stmt',
'for_stmt', 'try_stmt'):
for stack_node in stack:
if stack_node.nonterminal in ('if_stmt', 'while_stmt', 'for_stmt', 'try_stmt'):
return False
return True
@@ -52,10 +51,10 @@ def suite_or_file_input_is_valid(pgen_grammar, stack):
if not _flows_finished(pgen_grammar, stack):
return False
for dfa, newstate, (symbol_number, nodes) in reversed(stack):
if pgen_grammar.number2symbol[symbol_number] == 'suite':
for stack_node in reversed(stack):
if stack_node.nonterminal == 'suite':
# If only newline is in the suite, the suite is not valid, yet.
return len(nodes) > 1
return len(stack_node.nodes) > 1
# Not reaching a suite means that we're dealing with file_input levels
# where there's no need for a valid statement in it. It can also be empty.
return True
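These stack checks back parso's incremental parsing; a hedged sketch of the public entry point that exercises the DiffParser (the path is illustrative):

    import parso

    grammar = parso.load_grammar()
    # A second parse of a changed file reuses the cached tree via DiffParser.
    module = grammar.parse("x = 1\n", path='example.py', diff_cache=True)
    module = grammar.parse("x = 2\n", path='example.py', diff_cache=True)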
@@ -168,8 +167,7 @@ class DiffParser(object):
def _enabled_debugging(self, old_lines, lines_new):
if self._module.get_code() != ''.join(lines_new):
LOG.warning('parser issue:\n%s\n%s', ''.join(old_lines),
''.join(lines_new))
LOG.warning('parser issue:\n%s\n%s', ''.join(old_lines), ''.join(lines_new))
def _copy_from_old_parser(self, line_offset, until_line_old, until_line_new):
copied_nodes = [None]
@@ -273,7 +271,6 @@ class DiffParser(object):
# memoryview?
parsed_until_line = self._nodes_stack.parsed_until_line
lines_after = self._parser_lines_new[parsed_until_line:]
#print('parse_content', parsed_until_line, lines_after, until_line)
tokens = self._diff_tokenize(
lines_after,
until_line,
@@ -290,10 +287,10 @@ class DiffParser(object):
omitted_first_indent = False
indents = []
tokens = self._tokenizer(lines, (1, 0))
stack = self._active_parser.pgen_parser.stack
stack = self._active_parser.stack
for typ, string, start_pos, prefix in tokens:
start_pos = start_pos[0] + line_offset, start_pos[1]
if typ == INDENT:
if typ == PythonTokenTypes.INDENT:
indents.append(start_pos[1])
if is_first_token:
omitted_first_indent = True
@@ -306,8 +303,9 @@ class DiffParser(object):
# In case of omitted_first_indent, it might not be dedented fully.
# However this is a sign for us that a dedent happened.
if typ == DEDENT \
or typ == ERROR_DEDENT and omitted_first_indent and len(indents) == 1:
if typ == PythonTokenTypes.DEDENT \
or typ == PythonTokenTypes.ERROR_DEDENT \
and omitted_first_indent and len(indents) == 1:
indents.pop()
if omitted_first_indent and not indents:
# We are done here, only thing that can come now is an
@@ -317,18 +315,22 @@ class DiffParser(object):
prefix = re.sub(r'(<=\n)[^\n]+$', '', prefix)
else:
prefix = ''
yield PythonToken(ENDMARKER, '', (start_pos[0] + line_offset, 0), prefix)
yield PythonToken(
PythonTokenTypes.ENDMARKER, '',
(start_pos[0] + line_offset, 0),
prefix
)
break
elif typ == NEWLINE and start_pos[0] >= until_line:
elif typ == PythonTokenTypes.NEWLINE and start_pos[0] >= until_line:
yield PythonToken(typ, string, start_pos, prefix)
# Check if the parser is actually in a valid suite state.
if suite_or_file_input_is_valid(self._pgen_grammar, stack):
start_pos = start_pos[0] + 1, 0
while len(indents) > int(omitted_first_indent):
indents.pop()
yield PythonToken(DEDENT, '', start_pos, '')
yield PythonToken(PythonTokenTypes.DEDENT, '', start_pos, '')
yield PythonToken(ENDMARKER, '', start_pos, '')
yield PythonToken(PythonTokenTypes.ENDMARKER, '', start_pos, '')
break
else:
continue
@@ -537,7 +539,7 @@ class _NodesStack(object):
line_offset_index = -2
elif (new_nodes[-1].type in ('error_leaf', 'error_node') or
_is_flow_node(new_nodes[-1])):
_is_flow_node(new_nodes[-1])):
# Error leafs/nodes don't have a defined start/end. Error
# nodes might not end with a newline (e.g. if there's an
# open `(`). Therefore ignore all of them unless they are

View File

@@ -306,12 +306,12 @@ class ErrorFinder(Normalizer):
def visit_leaf(self, leaf):
if leaf.type == 'error_leaf':
if leaf.original_type in ('indent', 'error_dedent'):
if leaf.token_type in ('INDENT', 'ERROR_DEDENT'):
# Indents/Dedents itself never have a prefix. They are just
# "pseudo" tokens that get removed by the syntax tree later.
# Therefore in case of an error we also have to check for this.
spacing = list(leaf.get_next_leaf()._split_prefix())[-1]
if leaf.original_type == 'indent':
if leaf.token_type == 'INDENT':
message = 'unexpected indent'
else:
message = 'unindent does not match any outer indentation level'

View File

@@ -1,9 +1,11 @@
from parso.python import tree
from parso.python.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER,
STRING, tok_name, NAME, FSTRING_STRING,
FSTRING_START, FSTRING_END)
from parso.python.token import PythonTokenTypes
from parso.parser import BaseParser
from parso.pgen2.parse import token_to_ilabel
NAME = PythonTokenTypes.NAME
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
class Parser(BaseParser):
@@ -53,53 +55,33 @@ class Parser(BaseParser):
# Names/Keywords are handled separately
_leaf_map = {
STRING: tree.String,
NUMBER: tree.Number,
NEWLINE: tree.Newline,
ENDMARKER: tree.EndMarker,
FSTRING_STRING: tree.FStringString,
FSTRING_START: tree.FStringStart,
FSTRING_END: tree.FStringEnd,
PythonTokenTypes.STRING: tree.String,
PythonTokenTypes.NUMBER: tree.Number,
PythonTokenTypes.NEWLINE: tree.Newline,
PythonTokenTypes.ENDMARKER: tree.EndMarker,
PythonTokenTypes.FSTRING_STRING: tree.FStringString,
PythonTokenTypes.FSTRING_START: tree.FStringStart,
PythonTokenTypes.FSTRING_END: tree.FStringEnd,
}
def __init__(self, pgen_grammar, error_recovery=True, start_symbol='file_input'):
super(Parser, self).__init__(pgen_grammar, start_symbol, error_recovery=error_recovery)
def __init__(self, pgen_grammar, error_recovery=True, start_nonterminal='file_input'):
super(Parser, self).__init__(pgen_grammar, start_nonterminal,
error_recovery=error_recovery)
self.syntax_errors = []
self._omit_dedent_list = []
self._indent_counter = 0
# TODO do print absolute import detection here.
# try:
# del python_grammar_no_print_statement.keywords["print"]
# except KeyError:
# pass # Doesn't exist in the Python 3 grammar.
# if self.options["print_function"]:
# python_grammar = pygram.python_grammar_no_print_statement
# else:
def parse(self, tokens):
if self._error_recovery:
if self._start_symbol != 'file_input':
if self._start_nonterminal != 'file_input':
raise NotImplementedError
tokens = self._recovery_tokenize(tokens)
node = super(Parser, self).parse(tokens)
return super(Parser, self).parse(tokens)
if self._start_symbol == 'file_input' != node.type:
# If there's only one statement, we get back a non-module. That's
# not what we want, we want a module, so we add it here:
node = self.convert_node(
self._pgen_grammar,
self._pgen_grammar.symbol2number['file_input'],
[node]
)
return node
def convert_node(self, pgen_grammar, type, children):
def convert_node(self, nonterminal, children):
"""
Convert raw node information to a PythonBaseNode instance.
@@ -107,154 +89,114 @@ class Parser(BaseParser):
grammar rule produces a new complete node, so that the tree is built
strictly bottom-up.
"""
# TODO REMOVE symbol, we don't want type here.
symbol = pgen_grammar.number2symbol[type]
try:
return self.node_map[symbol](children)
return self.node_map[nonterminal](children)
except KeyError:
if symbol == 'suite':
if nonterminal == 'suite':
# We don't want the INDENT/DEDENT in our parser tree. Those
# leaves are just cancer. They are virtual leaves and not real
# ones and therefore have pseudo start/end positions and no
# prefixes. Just ignore them.
children = [children[0]] + children[2:-1]
elif symbol == 'list_if':
elif nonterminal == 'list_if':
# Make transitioning from 2 to 3 easier.
symbol = 'comp_if'
elif symbol == 'listmaker':
nonterminal = 'comp_if'
elif nonterminal == 'listmaker':
# Same as list_if above.
symbol = 'testlist_comp'
return self.default_node(symbol, children)
nonterminal = 'testlist_comp'
return self.default_node(nonterminal, children)
def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos):
def convert_leaf(self, type, value, prefix, start_pos):
# print('leaf', repr(value), token.tok_name[type])
if type == NAME:
if value in pgen_grammar.keywords:
if value in self._pgen_grammar.reserved_syntax_strings:
return tree.Keyword(value, start_pos, prefix)
else:
return tree.Name(value, start_pos, prefix)
return self._leaf_map.get(type, tree.Operator)(value, start_pos, prefix)
def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
add_token_callback):
def get_symbol_and_nodes(stack):
for dfa, state, (type_, nodes) in stack:
symbol = pgen_grammar.number2symbol[type_]
yield symbol, nodes
tos_nodes = stack.get_tos_nodes()
def error_recovery(self, token):
tos_nodes = self.stack[-1].nodes
if tos_nodes:
last_leaf = tos_nodes[-1].get_last_leaf()
else:
last_leaf = None
if self._start_symbol == 'file_input' and \
(typ == ENDMARKER or typ == DEDENT and '\n' not in last_leaf.value):
def reduce_stack(states, newstate):
# reduce
state = newstate
while states[state] == [(0, state)]:
self.pgen_parser._pop()
dfa, state, (type_, nodes) = stack[-1]
states, first = dfa
if self._start_nonterminal == 'file_input' and \
(token.type == PythonTokenTypes.ENDMARKER or
token.type == DEDENT and '\n' not in last_leaf.value):
# In Python, statements need to end with a newline. But since it's
# possible (and valid Python) that there's no newline at the
# end of a file, we have to recover even if the user doesn't want
# error recovery.
#print('x', pprint.pprint(stack))
ilabel = token_to_ilabel(pgen_grammar, NEWLINE, value)
dfa, state, (type_, nodes) = stack[-1]
symbol = pgen_grammar.number2symbol[type_]
states, first = dfa
arcs = states[state]
# Look for a state with this label
for i, newstate in arcs:
if ilabel == i:
if symbol == 'simple_stmt':
# This is basically shifting
stack[-1] = (dfa, newstate, (type_, nodes))
reduce_stack(states, newstate)
add_token_callback(typ, value, start_pos, prefix)
if self.stack[-1].dfa.from_rule == 'simple_stmt':
try:
plan = self.stack[-1].dfa.transitions[PythonTokenTypes.NEWLINE]
except KeyError:
pass
else:
if plan.next_dfa.is_final and not plan.dfa_pushes:
# We are ignoring here that the newline would be
# required for a simple_stmt.
self.stack[-1].dfa = plan.next_dfa
self._add_token(token)
return
# Check if we're at the right point
#for symbol, nodes in get_symbol_and_nodes(stack):
# self.pgen_parser._pop()
#break
break
#symbol = pgen_grammar.number2symbol[type_]
if not self._error_recovery:
return super(Parser, self).error_recovery(
pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
add_token_callback)
return super(Parser, self).error_recovery(token)
def current_suite(stack):
# For now just discard everything that is not a suite or
# file_input, if we detect an error.
one_line_suite = False
for index, (symbol, nodes) in reversed(list(enumerate(get_symbol_and_nodes(stack)))):
for until_index, stack_node in reversed(list(enumerate(stack))):
# `suite` can sometimes be only simple_stmt, not stmt.
if one_line_suite:
if stack_node.nonterminal == 'file_input':
break
elif symbol == 'file_input':
break
elif symbol == 'suite':
if len(nodes) > 1:
elif stack_node.nonterminal == 'suite':
# If the suite only contains a newline, we don't want to do
# error recovery here; in all other cases, we do.
if len(stack_node.nodes) != 1:
break
elif not nodes:
one_line_suite = True
# `suite` without an indent are error nodes.
return index, symbol, nodes
return until_index
index, symbol, nodes = current_suite(stack)
until_index = current_suite(self.stack)
# print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index)
if self._stack_removal(pgen_grammar, stack, arcs, index + 1, value, start_pos):
add_token_callback(typ, value, start_pos, prefix)
if self._stack_removal(until_index + 1):
self._add_token(token)
else:
typ, value, start_pos, prefix = token
if typ == INDENT:
# For every deleted INDENT we have to delete a DEDENT as well.
# Otherwise the parser will get into trouble and DEDENT too early.
self._omit_dedent_list.append(self._indent_counter)
error_leaf = tree.PythonErrorLeaf(tok_name[typ].lower(), value, start_pos, prefix)
stack[-1][2][1].append(error_leaf)
error_leaf = tree.PythonErrorLeaf(typ.name, value, start_pos, prefix)
self.stack[-1].nodes.append(error_leaf)
if symbol == 'suite':
dfa, state, node = stack[-1]
states, first = dfa
arcs = states[state]
intended_label = pgen_grammar.symbol2label['stmt']
# Introduce a proper state transition. We're basically allowing
# there to be no valid statements inside a suite.
if [x[0] for x in arcs] == [intended_label]:
new_state = arcs[0][1]
stack[-1] = dfa, new_state, node
tos = self.stack[-1]
if tos.nonterminal == 'suite':
# Need at least one statement in the suite. This happened with the
# error recovery above.
try:
tos.dfa = tos.dfa.arcs['stmt']
except KeyError:
# We're already in a final state.
pass
def _stack_removal(self, pgen_grammar, stack, arcs, start_index, value, start_pos):
failed_stack = False
found = False
all_nodes = []
for dfa, state, (type_, nodes) in stack[start_index:]:
if nodes:
found = True
if found:
failed_stack = True
all_nodes += nodes
if failed_stack:
stack[start_index - 1][2][1].append(tree.PythonErrorNode(all_nodes))
def _stack_removal(self, start_index):
all_nodes = [node for stack_node in self.stack[start_index:] for node in stack_node.nodes]
stack[start_index:] = []
return failed_stack
if all_nodes:
self.stack[start_index - 1].nodes.append(tree.PythonErrorNode(all_nodes))
self.stack[start_index:] = []
return bool(all_nodes)
def _recovery_tokenize(self, tokens):
for typ, value, start_pos, prefix in tokens:
for token in tokens:
typ = token[0]
# print(tok_name[typ], repr(value), start_pos, repr(prefix))
if typ == DEDENT:
# We need to count indents, because if we just omit any DEDENT,
@@ -267,4 +209,4 @@ class Parser(BaseParser):
                self._indent_counter -= 1
            elif typ == INDENT:
                self._indent_counter += 1

            yield token
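The INDENT/DEDENT bookkeeping is easy to get wrong, so here is a toy model of it, with made-up token name strings and without parso's real classes: error recovery records the indentation level of every INDENT it throws away, and this filter later swallows exactly one matching DEDENT so the levels stay balanced.

def recovery_filter(tokens, omit_dedent_list):
    indent_counter = 0
    for typ in tokens:
        if typ == 'INDENT':
            indent_counter += 1
        elif typ == 'DEDENT':
            if omit_dedent_list and omit_dedent_list[-1] == indent_counter:
                omit_dedent_list.pop()
                continue  # Skip the DEDENT that matches a dropped INDENT.
            indent_counter -= 1
        yield typ

# Pretend error recovery already dropped the INDENT at level 1:
print(list(recovery_filter(['INDENT', 'NAME', 'DEDENT'], [1])))
# -> ['INDENT', 'NAME']: the matching DEDENT was omitted.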

View File

@@ -1,115 +1,27 @@
from __future__ import absolute_import


class TokenType(object):
    def __init__(self, name, contains_syntax=False):
        self.name = name
        self.contains_syntax = contains_syntax

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, self.name)


class TokenTypes(object):
    """
    Basically an enum, but Python 2 doesn't have enums in the standard
    library.
    """
    def __init__(self, names, contains_syntax):
        for name in names:
            setattr(self, name, TokenType(name, contains_syntax=name in contains_syntax))


PythonTokenTypes = TokenTypes((
    'STRING', 'NUMBER', 'NAME', 'ERRORTOKEN', 'NEWLINE', 'INDENT', 'DEDENT',
    'ERROR_DEDENT', 'FSTRING_STRING', 'FSTRING_START', 'FSTRING_END', 'OP',
    'ENDMARKER'),
    contains_syntax=('NAME', 'OP'),
)
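Usage-wise, the attributes of `PythonTokenTypes` are `TokenType` instances rather than integers, so a token type carries its name and whether its string matters to the grammar. A quick sketch:

from parso.python.token import PythonTokenTypes

print(PythonTokenTypes.NAME)                    # TokenType(NAME)
print(PythonTokenTypes.NAME.contains_syntax)    # True: keywords arrive as NAME
print(PythonTokenTypes.NUMBER.contains_syntax)  # False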

View File

@@ -18,14 +18,25 @@ from collections import namedtuple
import itertools as _itertools
from codecs import BOM_UTF8
from parso.python.token import PythonTokenTypes
from parso._compatibility import py_version
from parso.utils import split_lines
STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
OP = PythonTokenTypes.OP
NEWLINE = PythonTokenTypes.NEWLINE
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ENDMARKER = PythonTokenTypes.ENDMARKER
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END
TokenCollection = namedtuple(
    'TokenCollection',
    'pseudo_token single_quoted triple_quoted endpats whitespace '
@@ -242,12 +253,9 @@ class Token(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
class PythonToken(Token):
    def __repr__(self):
        return ('TokenInfo(type=%s, string=%r, start_pos=%r, prefix=%r)' %
                self._replace(type=self.type.name))
class FStringNode(object):
@@ -396,7 +404,9 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
            endmatch = endprog.match(line)
            if endmatch:
                pos = endmatch.end(0)
                yield PythonToken(
                    STRING, contstr + line[:pos],
                    contstr_start, prefix)
                contstr = ''
                contline = None
            else:
@@ -571,13 +581,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                        and fstring_stack[-1].parentheses_count == 1:
                    fstring_stack[-1].format_spec_count += 1

                yield PythonToken(OP, token, spos, prefix)

    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
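The net effect on the tokenizer's output: all operators now share the single `OP` token type, and the concrete operator is only visible in the token's string. A small sketch, using `parse_version_string` to build the version argument:

from parso.python.tokenize import tokenize
from parso.utils import parse_version_string

for token in tokenize('1 + 2\n', parse_version_string('3.6')):
    print(token.type, repr(token.string))
# e.g. TokenType(NUMBER) '1', TokenType(OP) '+', TokenType(NUMBER) '2', ...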

View File

@@ -124,7 +124,7 @@ class PythonLeaf(PythonMixin, Leaf):
        # indent error leafs somehow? No idea how, though.
        previous_leaf = self.get_previous_leaf()
        if previous_leaf is not None and previous_leaf.type == 'error_leaf' \
                and previous_leaf.token_type in ('INDENT', 'ERROR_DEDENT'):
            previous_leaf = previous_leaf.get_previous_leaf()

        if previous_leaf is None:

View File

@@ -229,6 +229,7 @@ class Leaf(NodeOrLeaf):
class TypedLeaf(Leaf):
    __slots__ = ('type',)

    def __init__(self, type, value, start_pos, prefix=''):
        super(TypedLeaf, self).__init__(value, start_pos, prefix)
        self.type = type
@@ -351,13 +352,13 @@ class ErrorLeaf(Leaf):
    """
    A leaf that is either completely invalid in a language (like `$` in
    Python) or is invalid at that position. Like the star in `1 +* 1`.
    """
    __slots__ = ('token_type',)
    type = 'error_leaf'

    def __init__(self, token_type, value, start_pos, prefix=''):
        super(ErrorLeaf, self).__init__(value, start_pos, prefix)
        self.token_type = token_type

    def __repr__(self):
        return "<%s: %s:%s, %s>" % \
            (type(self).__name__, self.token_type, repr(self.value), self.start_pos)
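For downstream code, this rename means error leaves now expose the upper-case token type name instead of the old lowercase `original_type`. A short sketch, assuming the usual error-recovery tree shape for a lone `$` (the exact child index may differ):

import parso

module = parso.parse('$\n')
error_leaf = module.children[0]
print(error_leaf.type)        # 'error_leaf'
print(error_leaf.token_type)  # 'ERRORTOKEN' (was 'errortoken' before)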

View File

@@ -11,25 +11,51 @@ def test_with_stmt():
    assert module.children[2].type == 'name'


def test_one_line_function(each_version):
    module = parse('def x(): f.', version=each_version)
    assert module.children[0].type == 'funcdef'
    def_, name, parameters, colon, f = module.children[0].children
    assert f.type == 'error_node'

    module = parse('def x(a:', version=each_version)
    func = module.children[0]
    assert func.type == 'error_node'
    if each_version.startswith('2'):
        assert func.children[-1].value == 'a'
    else:
        assert func.children[-1] == ':'


def test_if_else():
    module = parse('if x:\n f.\nelse:\n g(')
    if_stmt = module.children[0]
    if_, test, colon, suite1, else_, colon, suite2 = if_stmt.children
    f = suite1.children[1]
    assert f.type == 'error_node'
    assert f.children[0].value == 'f'
    assert f.children[1].value == '.'
    g = suite2.children[1]
    assert g.children[0].value == 'g'
    assert g.children[1].value == '('


def test_if_stmt():
    module = parse('if x: f.\nelse: g(')
    if_stmt = module.children[0]
    assert if_stmt.type == 'if_stmt'
    if_, test, colon, f = if_stmt.children
    assert f.type == 'error_node'
    assert f.children[0].value == 'f'
    assert f.children[1].value == '.'

    assert module.children[1].type == 'newline'
    assert module.children[1].value == '\n'
    assert module.children[2].type == 'error_leaf'
    assert module.children[2].value == 'else'
    assert module.children[3].type == 'error_leaf'
    assert module.children[3].value == ':'

    in_else_stmt = module.children[4]
    assert in_else_stmt.type == 'error_node'
    assert in_else_stmt.children[0].value == 'g'
    assert in_else_stmt.children[1].value == '('

View File

@@ -12,6 +12,8 @@ import pytest
from parso import load_grammar
from parso import ParserSyntaxError
from parso.pgen2 import generate_grammar
from parso.python import tokenize
def _parse(code, version=None):
@@ -270,3 +272,19 @@ def py_br(each_version):
def test_py3_rb(works_ge_py3):
    works_ge_py3.parse("rb'1'")
    works_ge_py3.parse("RB'1'")
def test_left_recursion():
    with pytest.raises(ValueError, match='left recursion'):
        generate_grammar('foo: foo NAME\n', tokenize.PythonTokenTypes)


def test_ambiguities():
    with pytest.raises(ValueError, match='ambiguous'):
        generate_grammar('foo: bar | baz\nbar: NAME\nbaz: NAME\n', tokenize.PythonTokenTypes)

    with pytest.raises(ValueError, match='ambiguous'):
        generate_grammar('''foo: bar | baz\nbar: 'x'\nbaz: "x"\n''', tokenize.PythonTokenTypes)

    with pytest.raises(ValueError, match='ambiguous'):
        generate_grammar('''foo: bar | 'x'\nbar: 'x'\n''', tokenize.PythonTokenTypes)
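For contrast with the failing cases above, a grammar whose alternatives begin with distinct token types passes the check, since one token of lookahead is enough to pick a branch. A small sketch; `_nonterminal_to_dfas` is a private attribute, inspected here only for illustration:

from parso.pgen2 import generate_grammar
from parso.python.token import PythonTokenTypes

# NAME and NUMBER are distinct terminals, so this grammar generates fine.
grammar = generate_grammar('foo: NAME | NUMBER\n', PythonTokenTypes)
print(sorted(grammar._nonterminal_to_dfas))  # ['foo']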

View File

@@ -1,20 +1,30 @@
# -*- coding: utf-8 # This file contains Unicode characters.
from textwrap import dedent

import pytest

from parso._compatibility import py_version
from parso.utils import split_lines, parse_version_string
from parso.python.token import PythonTokenTypes
from parso.python import tokenize
from parso import parse
from parso.python.tokenize import PythonToken


# To make it easier to access some of the token types, just put them here.
NAME = PythonTokenTypes.NAME
NEWLINE = PythonTokenTypes.NEWLINE
STRING = PythonTokenTypes.STRING
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
OP = PythonTokenTypes.OP
ENDMARKER = PythonTokenTypes.ENDMARKER
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START


def _get_token_list(string):
    # Load the current version.
    version_info = parse_version_string()
@@ -128,7 +138,7 @@ def test_identifier_contains_unicode():
    else:
        # Unicode tokens in Python 2 seem to be identified as operators.
        # They will be ignored in the parser, that's ok.
        assert unicode_token[0] == OP
def test_quoted_strings():
@@ -188,17 +198,17 @@ def test_ur_literals():
def test_error_literal():
    error_token, endmarker = _get_token_list('"\n')
    assert error_token.type == ERRORTOKEN
    assert endmarker.prefix == ''
    assert error_token.string == '"\n'
    assert endmarker.type == ENDMARKER
    assert endmarker.prefix == ''

    bracket, error_token, endmarker = _get_token_list('( """')
    assert error_token.type == ERRORTOKEN
    assert error_token.prefix == ' '
    assert error_token.string == '"""'
    assert endmarker.type == ENDMARKER
    assert endmarker.prefix == ''
@@ -236,14 +246,3 @@ def test_error_string():
    assert t1.prefix == ' '
    assert t1.string == '"\n'
    assert endmarker.string == ''