mirror of
https://github.com/davidhalter/parso.git
synced 2025-12-07 05:14:29 +08:00
Compare commits
122 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ff67de248f | ||
|
|
1af5d9d46b | ||
|
|
fce3ead829 | ||
|
|
55d5d39c53 | ||
|
|
c8bf23b787 | ||
|
|
98c9a1ec7f | ||
|
|
ecdb90d9bc | ||
|
|
375ebf2181 | ||
|
|
badb2fe010 | ||
|
|
8e118c913c | ||
|
|
52fc8fc569 | ||
|
|
97cdb448d4 | ||
|
|
603b67ee6d | ||
|
|
7686273287 | ||
|
|
692436ba12 | ||
|
|
f7d3d4e82f | ||
|
|
edce279dee | ||
|
|
a9e40eb578 | ||
|
|
b14f518306 | ||
|
|
8407894b25 | ||
|
|
e4efebc9f3 | ||
|
|
f66e47c540 | ||
|
|
706a92ee0d | ||
|
|
91d864b23d | ||
|
|
e20f2069ba | ||
|
|
4cf198285a | ||
|
|
30cf491b4f | ||
|
|
c1675da0cb | ||
|
|
7b7b66eb3c | ||
|
|
5d46c3e18b | ||
|
|
e9fde82512 | ||
|
|
a46ecbb499 | ||
|
|
da5aa8a2ab | ||
|
|
43d4a8a834 | ||
|
|
309033ae2d | ||
|
|
2a9d8632fe | ||
|
|
530a324643 | ||
|
|
71003bc20e | ||
|
|
c5d141bf60 | ||
|
|
e958b241c7 | ||
|
|
34ab35558f | ||
|
|
03de9cebb8 | ||
|
|
6098d89150 | ||
|
|
ff4358cd97 | ||
|
|
b5378e4602 | ||
|
|
33e321a539 | ||
|
|
a890ddd6cc | ||
|
|
1362d4f05d | ||
|
|
532aef2342 | ||
|
|
878b4b2d3b | ||
|
|
87299335c4 | ||
|
|
4f0e9c0fd7 | ||
|
|
67ca091631 | ||
|
|
4e5ba02dbb | ||
|
|
a85f544901 | ||
|
|
9e8066c6fd | ||
|
|
68eab72229 | ||
|
|
d9264609f2 | ||
|
|
79c7e0b59d | ||
|
|
f03a87b876 | ||
|
|
2a082d69df | ||
|
|
e6fc739670 | ||
|
|
12e11b3d16 | ||
|
|
cc8038966b | ||
|
|
31aecf2d35 | ||
|
|
d8554d86d1 | ||
|
|
d691bf0fd1 | ||
|
|
5712ffb5ca | ||
|
|
55d6a69aad | ||
|
|
453471eeb6 | ||
|
|
a06c3a3129 | ||
|
|
73ce57428b | ||
|
|
640f544af9 | ||
|
|
b6cbf306d7 | ||
|
|
95e4ecf592 | ||
|
|
fbed1ecfe0 | ||
|
|
1f27fa9320 | ||
|
|
23362ec2d3 | ||
|
|
6b391af071 | ||
|
|
c43cb21a0e | ||
|
|
24346a0d32 | ||
|
|
9d452ec66a | ||
|
|
567e0d7aed | ||
|
|
1f02327cff | ||
|
|
8c348aee6f | ||
|
|
a277ccf288 | ||
|
|
a5ce2caab6 | ||
|
|
da4df9c0f1 | ||
|
|
bd444df417 | ||
|
|
275dbca1b9 | ||
|
|
9a0b6f4928 | ||
|
|
fc5560874b | ||
|
|
6e5a520e7b | ||
|
|
dcabf3d415 | ||
|
|
3bc82d112d | ||
|
|
ec186a78f8 | ||
|
|
3818fb2b22 | ||
|
|
95ddeb4012 | ||
|
|
f638abb08e | ||
|
|
f8558df27a | ||
|
|
bae56e72e1 | ||
|
|
41c38311f7 | ||
|
|
eeb456a6d4 | ||
|
|
1c0956d9e0 | ||
|
|
c17156bd36 | ||
|
|
8865aa452c | ||
|
|
e0c79a9fcc | ||
|
|
3c08b1b058 | ||
|
|
0f32673092 | ||
|
|
1e18163402 | ||
|
|
cef9f1bdbd | ||
|
|
23db71a5f7 | ||
|
|
34154d05a0 | ||
|
|
6f385bdba1 | ||
|
|
4fc31c58b3 | ||
|
|
689decc66c | ||
|
|
c2eacdb81c | ||
|
|
ac0bf4fcdd | ||
|
|
948f9ccecc | ||
|
|
f20106d88e | ||
|
|
f4912f6c17 | ||
|
|
bf5a4b7c2c |
@@ -3,7 +3,6 @@ sudo: false
|
||||
python:
|
||||
- 2.6
|
||||
- 2.7
|
||||
- 3.3
|
||||
- 3.4
|
||||
- 3.5
|
||||
- 3.6
|
||||
|
||||
@@ -3,6 +3,17 @@
|
||||
Changelog
|
||||
---------
|
||||
|
||||
0.3.0 (2018-07-30)
|
||||
+++++++++++++++++++
|
||||
|
||||
- Rewrote the pgen2 parser generator.
|
||||
|
||||
0.2.1 (2018-05-21)
|
||||
+++++++++++++++++++
|
||||
|
||||
- A bugfix for the diff parser.
|
||||
- Grammar files can now be loaded from a specific path.
|
||||
|
||||
0.2.0 (2018-04-15)
|
||||
+++++++++++++++++++
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@ from parso.grammar import Grammar, load_grammar
|
||||
from parso.utils import split_lines, python_bytes_to_unicode
|
||||
|
||||
|
||||
__version__ = '0.2.0'
|
||||
__version__ = '0.3.0'
|
||||
|
||||
|
||||
def parse(code=None, **kwargs):
|
||||
|
||||
@@ -2,11 +2,11 @@ import hashlib
|
||||
import os
|
||||
|
||||
from parso._compatibility import FileNotFoundError, is_pypy
|
||||
from parso.pgen2.pgen import generate_grammar
|
||||
from parso.pgen2 import generate_grammar
|
||||
from parso.utils import split_lines, python_bytes_to_unicode, parse_version_string
|
||||
from parso.python.diff import DiffParser
|
||||
from parso.python.tokenize import tokenize_lines, tokenize
|
||||
from parso.python import token
|
||||
from parso.python.token import PythonTokenTypes
|
||||
from parso.cache import parser_cache, load_module, save_module
|
||||
from parso.parser import BaseParser
|
||||
from parso.python.parser import Parser as PythonParser
|
||||
@@ -20,7 +20,7 @@ class Grammar(object):
|
||||
"""
|
||||
:py:func:`parso.load_grammar` returns instances of this class.
|
||||
|
||||
Creating custom grammars by calling this is not supported, yet.
|
||||
Creating custom none-python grammars by calling this is not supported, yet.
|
||||
"""
|
||||
#:param text: A BNF representation of your grammar.
|
||||
_error_normalizer_config = None
|
||||
@@ -51,8 +51,8 @@ class Grammar(object):
|
||||
it is invalid, it will be returned as an error node. If disabled,
|
||||
you will get a ParseError when encountering syntax errors in your
|
||||
code.
|
||||
:param str start_symbol: The grammar symbol that you want to parse. Only
|
||||
allowed to be used when error_recovery is False.
|
||||
:param str start_symbol: The grammar rule (nonterminal) that you want
|
||||
to parse. Only allowed to be used when error_recovery is False.
|
||||
:param str path: The path to the file you want to open. Only needed for caching.
|
||||
:param bool cache: Keeps a copy of the parser tree in RAM and on disk
|
||||
if a path is given. Returns the cached trees if the corresponding
|
||||
@@ -88,7 +88,7 @@ class Grammar(object):
|
||||
raise TypeError("Please provide either code or a path.")
|
||||
|
||||
if start_symbol is None:
|
||||
start_symbol = self._start_symbol
|
||||
start_symbol = self._start_nonterminal
|
||||
|
||||
if error_recovery and start_symbol != 'file_input':
|
||||
raise NotImplementedError("This is currently not implemented.")
|
||||
@@ -136,7 +136,7 @@ class Grammar(object):
|
||||
p = self._parser(
|
||||
self._pgen_grammar,
|
||||
error_recovery=error_recovery,
|
||||
start_symbol=start_symbol
|
||||
start_nonterminal=start_symbol
|
||||
)
|
||||
root_node = p.parse(tokens=tokens)
|
||||
|
||||
@@ -186,15 +186,15 @@ class Grammar(object):
|
||||
return normalizer.issues
|
||||
|
||||
def __repr__(self):
|
||||
labels = self._pgen_grammar.number2symbol.values()
|
||||
txt = ' '.join(list(labels)[:3]) + ' ...'
|
||||
nonterminals = self._pgen_grammar._nonterminal_to_dfas.keys()
|
||||
txt = ' '.join(list(nonterminals)[:3]) + ' ...'
|
||||
return '<%s:%s>' % (self.__class__.__name__, txt)
|
||||
|
||||
|
||||
class PythonGrammar(Grammar):
|
||||
_error_normalizer_config = ErrorFinderConfig()
|
||||
_token_namespace = token
|
||||
_start_symbol = 'file_input'
|
||||
_token_namespace = PythonTokenTypes
|
||||
_start_nonterminal = 'file_input'
|
||||
|
||||
def __init__(self, version_info, bnf_text):
|
||||
super(PythonGrammar, self).__init__(
|
||||
@@ -219,12 +219,13 @@ def load_grammar(**kwargs):
|
||||
version.
|
||||
|
||||
:param str version: A python version string, e.g. ``version='3.3'``.
|
||||
:param str path: A path to a grammar file
|
||||
"""
|
||||
def load_grammar(language='python', version=None):
|
||||
def load_grammar(language='python', version=None, path=None):
|
||||
if language == 'python':
|
||||
version_info = parse_version_string(version)
|
||||
|
||||
file = os.path.join(
|
||||
file = path or os.path.join(
|
||||
'python',
|
||||
'grammar%s%s.txt' % (version_info.major, version_info.minor)
|
||||
)
|
||||
|
||||
172
parso/parser.py
172
parso/parser.py
@@ -1,3 +1,11 @@
|
||||
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
# Modifications:
|
||||
# Copyright David Halter and Contributors
|
||||
# Modifications are dual-licensed: MIT and PSF.
|
||||
# 99% of the code is different from pgen2, now.
|
||||
|
||||
"""
|
||||
The ``Parser`` tries to convert the available Python code in an easy to read
|
||||
format, something like an abstract syntax tree. The classes who represent this
|
||||
@@ -16,7 +24,7 @@ complexity of the ``Parser`` (there's another parser sitting inside
|
||||
``Statement``, which produces ``Array`` and ``Call``).
|
||||
"""
|
||||
from parso import tree
|
||||
from parso.pgen2.parse import PgenParser
|
||||
from parso.pgen2.generator import ReservedString
|
||||
|
||||
|
||||
class ParserSyntaxError(Exception):
|
||||
@@ -30,7 +38,76 @@ class ParserSyntaxError(Exception):
|
||||
self.error_leaf = error_leaf
|
||||
|
||||
|
||||
class InternalParseError(Exception):
|
||||
"""
|
||||
Exception to signal the parser is stuck and error recovery didn't help.
|
||||
Basically this shouldn't happen. It's a sign that something is really
|
||||
wrong.
|
||||
"""
|
||||
|
||||
def __init__(self, msg, type_, value, start_pos):
|
||||
Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
|
||||
(msg, type_.name, value, start_pos))
|
||||
self.msg = msg
|
||||
self.type = type
|
||||
self.value = value
|
||||
self.start_pos = start_pos
|
||||
|
||||
|
||||
class Stack(list):
|
||||
def _allowed_transition_names_and_token_types(self):
|
||||
def iterate():
|
||||
# An API just for Jedi.
|
||||
for stack_node in reversed(self):
|
||||
for transition in stack_node.dfa.transitions:
|
||||
if isinstance(transition, ReservedString):
|
||||
yield transition.value
|
||||
else:
|
||||
yield transition # A token type
|
||||
|
||||
if not stack_node.dfa.is_final:
|
||||
break
|
||||
|
||||
return list(iterate())
|
||||
|
||||
|
||||
class StackNode(object):
|
||||
def __init__(self, dfa):
|
||||
self.dfa = dfa
|
||||
self.nodes = []
|
||||
|
||||
@property
|
||||
def nonterminal(self):
|
||||
return self.dfa.from_rule
|
||||
|
||||
def __repr__(self):
|
||||
return '%s(%s, %s)' % (self.__class__.__name__, self.dfa, self.nodes)
|
||||
|
||||
|
||||
def _token_to_transition(grammar, type_, value):
|
||||
# Map from token to label
|
||||
if type_.contains_syntax:
|
||||
# Check for reserved words (keywords)
|
||||
try:
|
||||
return grammar.reserved_syntax_strings[value]
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
return type_
|
||||
|
||||
|
||||
class BaseParser(object):
|
||||
"""Parser engine.
|
||||
|
||||
A Parser instance contains state pertaining to the current token
|
||||
sequence, and should not be used concurrently by different threads
|
||||
to parse separate token sequences.
|
||||
|
||||
See python/tokenize.py for how to get input tokens by a string.
|
||||
|
||||
When a syntax error occurs, error_recovery() is called.
|
||||
"""
|
||||
|
||||
node_map = {}
|
||||
default_node = tree.Node
|
||||
|
||||
@@ -38,41 +115,94 @@ class BaseParser(object):
|
||||
}
|
||||
default_leaf = tree.Leaf
|
||||
|
||||
def __init__(self, pgen_grammar, start_symbol='file_input', error_recovery=False):
|
||||
def __init__(self, pgen_grammar, start_nonterminal='file_input', error_recovery=False):
|
||||
self._pgen_grammar = pgen_grammar
|
||||
self._start_symbol = start_symbol
|
||||
self._start_nonterminal = start_nonterminal
|
||||
self._error_recovery = error_recovery
|
||||
|
||||
def parse(self, tokens):
|
||||
start_number = self._pgen_grammar.symbol2number[self._start_symbol]
|
||||
self.pgen_parser = PgenParser(
|
||||
self._pgen_grammar, self.convert_node, self.convert_leaf,
|
||||
self.error_recovery, start_number
|
||||
)
|
||||
first_dfa = self._pgen_grammar.nonterminal_to_dfas[self._start_nonterminal][0]
|
||||
self.stack = Stack([StackNode(first_dfa)])
|
||||
|
||||
node = self.pgen_parser.parse(tokens)
|
||||
# The stack is empty now, we don't need it anymore.
|
||||
del self.pgen_parser
|
||||
return node
|
||||
for token in tokens:
|
||||
self._add_token(token)
|
||||
|
||||
def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
|
||||
add_token_callback):
|
||||
while True:
|
||||
tos = self.stack[-1]
|
||||
if not tos.dfa.is_final:
|
||||
# We never broke out -- EOF is too soon -- Unfinished statement.
|
||||
# However, the error recovery might have added the token again, if
|
||||
# the stack is empty, we're fine.
|
||||
raise InternalParseError(
|
||||
"incomplete input", token.type, token.value, token.start_pos
|
||||
)
|
||||
|
||||
if len(self.stack) > 1:
|
||||
self._pop()
|
||||
else:
|
||||
return self.convert_node(tos.nonterminal, tos.nodes)
|
||||
|
||||
def error_recovery(self, token):
|
||||
if self._error_recovery:
|
||||
raise NotImplementedError("Error Recovery is not implemented")
|
||||
else:
|
||||
error_leaf = tree.ErrorLeaf('TODO %s' % typ, value, start_pos, prefix)
|
||||
type_, value, start_pos, prefix = token
|
||||
error_leaf = tree.ErrorLeaf('TODO %s' % type_, value, start_pos, prefix)
|
||||
raise ParserSyntaxError('SyntaxError: invalid syntax', error_leaf)
|
||||
|
||||
def convert_node(self, pgen_grammar, type_, children):
|
||||
# TODO REMOVE symbol, we don't want type here.
|
||||
symbol = pgen_grammar.number2symbol[type_]
|
||||
def convert_node(self, nonterminal, children):
|
||||
try:
|
||||
return self.node_map[symbol](children)
|
||||
return self.node_map[nonterminal](children)
|
||||
except KeyError:
|
||||
return self.default_node(symbol, children)
|
||||
return self.default_node(nonterminal, children)
|
||||
|
||||
def convert_leaf(self, pgen_grammar, type_, value, prefix, start_pos):
|
||||
def convert_leaf(self, type_, value, prefix, start_pos):
|
||||
try:
|
||||
return self.leaf_map[type_](value, start_pos, prefix)
|
||||
except KeyError:
|
||||
return self.default_leaf(value, start_pos, prefix)
|
||||
|
||||
def _add_token(self, token):
|
||||
"""
|
||||
This is the only core function for parsing. Here happens basically
|
||||
everything. Everything is well prepared by the parser generator and we
|
||||
only apply the necessary steps here.
|
||||
"""
|
||||
grammar = self._pgen_grammar
|
||||
stack = self.stack
|
||||
type_, value, start_pos, prefix = token
|
||||
transition = _token_to_transition(grammar, type_, value)
|
||||
|
||||
while True:
|
||||
try:
|
||||
plan = stack[-1].dfa.transitions[transition]
|
||||
break
|
||||
except KeyError:
|
||||
if stack[-1].dfa.is_final:
|
||||
self._pop()
|
||||
else:
|
||||
self.error_recovery(token)
|
||||
return
|
||||
except IndexError:
|
||||
raise InternalParseError("too much input", type_, value, start_pos)
|
||||
|
||||
stack[-1].dfa = plan.next_dfa
|
||||
|
||||
for push in plan.dfa_pushes:
|
||||
stack.append(StackNode(push))
|
||||
|
||||
leaf = self.convert_leaf(type_, value, prefix, start_pos)
|
||||
stack[-1].nodes.append(leaf)
|
||||
|
||||
def _pop(self):
|
||||
tos = self.stack.pop()
|
||||
# If there's exactly one child, return that child instead of
|
||||
# creating a new node. We still create expr_stmt and
|
||||
# file_input though, because a lot of Jedi depends on its
|
||||
# logic.
|
||||
if len(tos.nodes) == 1:
|
||||
new_node = tos.nodes[0]
|
||||
else:
|
||||
new_node = self.convert_node(tos.dfa.from_rule, tos.nodes)
|
||||
|
||||
self.stack[-1].nodes.append(new_node)
|
||||
|
||||
@@ -4,5 +4,7 @@
|
||||
# Modifications:
|
||||
# Copyright 2006 Google, Inc. All Rights Reserved.
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
# Copyright 2014 David Halter. Integration into Jedi.
|
||||
# Copyright 2014 David Halter and Contributors
|
||||
# Modifications are dual-licensed: MIT and PSF.
|
||||
|
||||
from parso.pgen2.generator import generate_grammar
|
||||
|
||||
359
parso/pgen2/generator.py
Normal file
359
parso/pgen2/generator.py
Normal file
@@ -0,0 +1,359 @@
|
||||
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
# Modifications:
|
||||
# Copyright David Halter and Contributors
|
||||
# Modifications are dual-licensed: MIT and PSF.
|
||||
|
||||
"""
|
||||
This module defines the data structures used to represent a grammar.
|
||||
|
||||
Specifying grammars in pgen is possible with this grammar::
|
||||
|
||||
grammar: (NEWLINE | rule)* ENDMARKER
|
||||
rule: NAME ':' rhs NEWLINE
|
||||
rhs: items ('|' items)*
|
||||
items: item+
|
||||
item: '[' rhs ']' | atom ['+' | '*']
|
||||
atom: '(' rhs ')' | NAME | STRING
|
||||
|
||||
This grammar is self-referencing.
|
||||
|
||||
This parser generator (pgen2) was created by Guido Rossum and used for lib2to3.
|
||||
Most of the code has been refactored to make it more Pythonic. Since this was a
|
||||
"copy" of the CPython Parser parser "pgen", there was some work needed to make
|
||||
it more readable. It should also be slightly faster than the original pgen2,
|
||||
because we made some optimizations.
|
||||
"""
|
||||
|
||||
from ast import literal_eval
|
||||
|
||||
from parso.pgen2.grammar_parser import GrammarParser, NFAState
|
||||
|
||||
|
||||
class Grammar(object):
|
||||
"""
|
||||
Once initialized, this class supplies the grammar tables for the
|
||||
parsing engine implemented by parse.py. The parsing engine
|
||||
accesses the instance variables directly.
|
||||
|
||||
The only important part in this parsers are dfas and transitions between
|
||||
dfas.
|
||||
"""
|
||||
|
||||
def __init__(self, start_nonterminal, rule_to_dfas, reserved_syntax_strings):
|
||||
self.nonterminal_to_dfas = rule_to_dfas # Dict[str, List[DFAState]]
|
||||
self.reserved_syntax_strings = reserved_syntax_strings
|
||||
self.start_nonterminal = start_nonterminal
|
||||
|
||||
|
||||
class DFAPlan(object):
|
||||
"""
|
||||
Plans are used for the parser to create stack nodes and do the proper
|
||||
DFA state transitions.
|
||||
"""
|
||||
def __init__(self, next_dfa, dfa_pushes=[]):
|
||||
self.next_dfa = next_dfa
|
||||
self.dfa_pushes = dfa_pushes
|
||||
|
||||
def __repr__(self):
|
||||
return '%s(%s, %s)' % (self.__class__.__name__, self.next_dfa, self.dfa_pushes)
|
||||
|
||||
|
||||
class DFAState(object):
|
||||
"""
|
||||
The DFAState object is the core class for pretty much anything. DFAState
|
||||
are the vertices of an ordered graph while arcs and transitions are the
|
||||
edges.
|
||||
|
||||
Arcs are the initial edges, where most DFAStates are not connected and
|
||||
transitions are then calculated to connect the DFA state machines that have
|
||||
different nonterminals.
|
||||
"""
|
||||
def __init__(self, from_rule, nfa_set, final):
|
||||
assert isinstance(nfa_set, set)
|
||||
assert isinstance(next(iter(nfa_set)), NFAState)
|
||||
assert isinstance(final, NFAState)
|
||||
self.from_rule = from_rule
|
||||
self.nfa_set = nfa_set
|
||||
self.arcs = {} # map from terminals/nonterminals to DFAState
|
||||
# In an intermediary step we set these nonterminal arcs (which has the
|
||||
# same structure as arcs). These don't contain terminals anymore.
|
||||
self.nonterminal_arcs = {}
|
||||
|
||||
# Transitions are basically the only thing that the parser is using
|
||||
# with is_final. Everyting else is purely here to create a parser.
|
||||
self.transitions = {} #: Dict[Union[TokenType, ReservedString], DFAPlan]
|
||||
self.is_final = final in nfa_set
|
||||
|
||||
def add_arc(self, next_, label):
|
||||
assert isinstance(label, str)
|
||||
assert label not in self.arcs
|
||||
assert isinstance(next_, DFAState)
|
||||
self.arcs[label] = next_
|
||||
|
||||
def unifystate(self, old, new):
|
||||
for label, next_ in self.arcs.items():
|
||||
if next_ is old:
|
||||
self.arcs[label] = new
|
||||
|
||||
def __eq__(self, other):
|
||||
# Equality test -- ignore the nfa_set instance variable
|
||||
assert isinstance(other, DFAState)
|
||||
if self.is_final != other.is_final:
|
||||
return False
|
||||
# Can't just return self.arcs == other.arcs, because that
|
||||
# would invoke this method recursively, with cycles...
|
||||
if len(self.arcs) != len(other.arcs):
|
||||
return False
|
||||
for label, next_ in self.arcs.items():
|
||||
if next_ is not other.arcs.get(label):
|
||||
return False
|
||||
return True
|
||||
|
||||
__hash__ = None # For Py3 compatibility.
|
||||
|
||||
def __repr__(self):
|
||||
return '<%s: %s is_final=%s>' % (
|
||||
self.__class__.__name__, self.from_rule, self.is_final
|
||||
)
|
||||
|
||||
|
||||
class ReservedString(object):
|
||||
"""
|
||||
Most grammars will have certain keywords and operators that are mentioned
|
||||
in the grammar as strings (e.g. "if") and not token types (e.g. NUMBER).
|
||||
This class basically is the former.
|
||||
"""
|
||||
|
||||
def __init__(self, value):
|
||||
self.value = value
|
||||
|
||||
def __repr__(self):
|
||||
return '%s(%s)' % (self.__class__.__name__, self.value)
|
||||
|
||||
|
||||
def _simplify_dfas(dfas):
|
||||
"""
|
||||
This is not theoretically optimal, but works well enough.
|
||||
Algorithm: repeatedly look for two states that have the same
|
||||
set of arcs (same labels pointing to the same nodes) and
|
||||
unify them, until things stop changing.
|
||||
|
||||
dfas is a list of DFAState instances
|
||||
"""
|
||||
changes = True
|
||||
while changes:
|
||||
changes = False
|
||||
for i, state_i in enumerate(dfas):
|
||||
for j in range(i + 1, len(dfas)):
|
||||
state_j = dfas[j]
|
||||
if state_i == state_j:
|
||||
#print " unify", i, j
|
||||
del dfas[j]
|
||||
for state in dfas:
|
||||
state.unifystate(state_j, state_i)
|
||||
changes = True
|
||||
break
|
||||
|
||||
|
||||
def _make_dfas(start, finish):
|
||||
"""
|
||||
Uses the powerset construction algorithm to create DFA states from sets of
|
||||
NFA states.
|
||||
|
||||
Also does state reduction if some states are not needed.
|
||||
"""
|
||||
# To turn an NFA into a DFA, we define the states of the DFA
|
||||
# to correspond to *sets* of states of the NFA. Then do some
|
||||
# state reduction.
|
||||
assert isinstance(start, NFAState)
|
||||
assert isinstance(finish, NFAState)
|
||||
|
||||
def addclosure(nfa_state, base_nfa_set):
|
||||
assert isinstance(nfa_state, NFAState)
|
||||
if nfa_state in base_nfa_set:
|
||||
return
|
||||
base_nfa_set.add(nfa_state)
|
||||
for nfa_arc in nfa_state.arcs:
|
||||
if nfa_arc.nonterminal_or_string is None:
|
||||
addclosure(nfa_arc.next, base_nfa_set)
|
||||
|
||||
base_nfa_set = set()
|
||||
addclosure(start, base_nfa_set)
|
||||
states = [DFAState(start.from_rule, base_nfa_set, finish)]
|
||||
for state in states: # NB states grows while we're iterating
|
||||
arcs = {}
|
||||
# Find state transitions and store them in arcs.
|
||||
for nfa_state in state.nfa_set:
|
||||
for nfa_arc in nfa_state.arcs:
|
||||
if nfa_arc.nonterminal_or_string is not None:
|
||||
nfa_set = arcs.setdefault(nfa_arc.nonterminal_or_string, set())
|
||||
addclosure(nfa_arc.next, nfa_set)
|
||||
|
||||
# Now create the dfa's with no None's in arcs anymore. All Nones have
|
||||
# been eliminated and state transitions (arcs) are properly defined, we
|
||||
# just need to create the dfa's.
|
||||
for nonterminal_or_string, nfa_set in arcs.items():
|
||||
for nested_state in states:
|
||||
if nested_state.nfa_set == nfa_set:
|
||||
# The DFA state already exists for this rule.
|
||||
break
|
||||
else:
|
||||
nested_state = DFAState(start.from_rule, nfa_set, finish)
|
||||
states.append(nested_state)
|
||||
|
||||
state.add_arc(nested_state, nonterminal_or_string)
|
||||
return states # List of DFAState instances; first one is start
|
||||
|
||||
|
||||
def _dump_nfa(start, finish):
|
||||
print("Dump of NFA for", start.from_rule)
|
||||
todo = [start]
|
||||
for i, state in enumerate(todo):
|
||||
print(" State", i, state is finish and "(final)" or "")
|
||||
for label, next_ in state.arcs:
|
||||
if next_ in todo:
|
||||
j = todo.index(next_)
|
||||
else:
|
||||
j = len(todo)
|
||||
todo.append(next_)
|
||||
if label is None:
|
||||
print(" -> %d" % j)
|
||||
else:
|
||||
print(" %s -> %d" % (label, j))
|
||||
|
||||
|
||||
def _dump_dfas(dfas):
|
||||
print("Dump of DFA for", dfas[0].from_rule)
|
||||
for i, state in enumerate(dfas):
|
||||
print(" State", i, state.is_final and "(final)" or "")
|
||||
for nonterminal, next_ in state.arcs.items():
|
||||
print(" %s -> %d" % (nonterminal, dfas.index(next_)))
|
||||
|
||||
|
||||
def generate_grammar(bnf_grammar, token_namespace):
|
||||
"""
|
||||
``bnf_text`` is a grammar in extended BNF (using * for repetition, + for
|
||||
at-least-once repetition, [] for optional parts, | for alternatives and ()
|
||||
for grouping).
|
||||
|
||||
It's not EBNF according to ISO/IEC 14977. It's a dialect Python uses in its
|
||||
own parser.
|
||||
"""
|
||||
rule_to_dfas = {}
|
||||
start_nonterminal = None
|
||||
for nfa_a, nfa_z in GrammarParser(bnf_grammar).parse():
|
||||
#_dump_nfa(a, z)
|
||||
dfas = _make_dfas(nfa_a, nfa_z)
|
||||
#_dump_dfas(dfas)
|
||||
# oldlen = len(dfas)
|
||||
_simplify_dfas(dfas)
|
||||
# newlen = len(dfas)
|
||||
rule_to_dfas[nfa_a.from_rule] = dfas
|
||||
#print(nfa_a.from_rule, oldlen, newlen)
|
||||
|
||||
if start_nonterminal is None:
|
||||
start_nonterminal = nfa_a.from_rule
|
||||
|
||||
reserved_strings = {}
|
||||
for nonterminal, dfas in rule_to_dfas.items():
|
||||
for dfa_state in dfas:
|
||||
for terminal_or_nonterminal, next_dfa in dfa_state.arcs.items():
|
||||
if terminal_or_nonterminal in rule_to_dfas:
|
||||
dfa_state.nonterminal_arcs[terminal_or_nonterminal] = next_dfa
|
||||
else:
|
||||
transition = _make_transition(
|
||||
token_namespace,
|
||||
reserved_strings,
|
||||
terminal_or_nonterminal
|
||||
)
|
||||
dfa_state.transitions[transition] = DFAPlan(next_dfa)
|
||||
|
||||
_calculate_tree_traversal(rule_to_dfas)
|
||||
return Grammar(start_nonterminal, rule_to_dfas, reserved_strings)
|
||||
|
||||
|
||||
def _make_transition(token_namespace, reserved_syntax_strings, label):
|
||||
"""
|
||||
Creates a reserved string ("if", "for", "*", ...) or returns the token type
|
||||
(NUMBER, STRING, ...) for a given grammar terminal.
|
||||
"""
|
||||
if label[0].isalpha():
|
||||
# A named token (e.g. NAME, NUMBER, STRING)
|
||||
return getattr(token_namespace, label)
|
||||
else:
|
||||
# Either a keyword or an operator
|
||||
assert label[0] in ('"', "'"), label
|
||||
assert not label.startswith('"""') and not label.startswith("'''")
|
||||
# TODO use literal_eval instead of a simple eval.
|
||||
value = literal_eval(label)
|
||||
try:
|
||||
return reserved_syntax_strings[value]
|
||||
except KeyError:
|
||||
r = reserved_syntax_strings[value] = ReservedString(value)
|
||||
return r
|
||||
|
||||
|
||||
def _calculate_tree_traversal(nonterminal_to_dfas):
|
||||
"""
|
||||
By this point we know how dfas can move around within a stack node, but we
|
||||
don't know how we can add a new stack node (nonterminal transitions).
|
||||
"""
|
||||
# Map from grammar rule (nonterminal) name to a set of tokens.
|
||||
first_plans = {}
|
||||
|
||||
nonterminals = list(nonterminal_to_dfas.keys())
|
||||
nonterminals.sort()
|
||||
for nonterminal in nonterminals:
|
||||
if nonterminal not in first_plans:
|
||||
_calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal)
|
||||
|
||||
# Now that we have calculated the first terminals, we are sure that
|
||||
# there is no left recursion or ambiguities.
|
||||
|
||||
for dfas in nonterminal_to_dfas.values():
|
||||
for dfa_state in dfas:
|
||||
for nonterminal, next_dfa in dfa_state.nonterminal_arcs.items():
|
||||
for transition, pushes in first_plans[nonterminal].items():
|
||||
dfa_state.transitions[transition] = DFAPlan(next_dfa, pushes)
|
||||
|
||||
|
||||
def _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal):
|
||||
"""
|
||||
Calculates the first plan in the first_plans dictionary for every given
|
||||
nonterminal. This is going to be used to know when to create stack nodes.
|
||||
"""
|
||||
dfas = nonterminal_to_dfas[nonterminal]
|
||||
new_first_plans = {}
|
||||
first_plans[nonterminal] = None # dummy to detect left recursion
|
||||
# We only need to check the first dfa. All the following ones are not
|
||||
# interesting to find first terminals.
|
||||
state = dfas[0]
|
||||
for transition, next_ in state.transitions.items():
|
||||
# It's a string. We have finally found a possible first token.
|
||||
new_first_plans[transition] = [next_.next_dfa]
|
||||
|
||||
for nonterminal2, next_ in state.nonterminal_arcs.items():
|
||||
# It's a nonterminal and we have either a left recursion issue
|
||||
# in the grammar or we have to recurse.
|
||||
try:
|
||||
first_plans2 = first_plans[nonterminal2]
|
||||
except KeyError:
|
||||
first_plans2 = _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal2)
|
||||
else:
|
||||
if first_plans2 is None:
|
||||
raise ValueError("left recursion for rule %r" % nonterminal)
|
||||
|
||||
for t, pushes in first_plans2.items():
|
||||
check = new_first_plans.get(t)
|
||||
if check is not None:
|
||||
raise ValueError(
|
||||
"Rule %s is ambiguous; %s is the"
|
||||
" start of the rule %s as well as %s."
|
||||
% (nonterminal, t, nonterminal2, check[-1].from_rule)
|
||||
)
|
||||
new_first_plans[t] = [next_] + pushes
|
||||
|
||||
first_plans[nonterminal] = new_first_plans
|
||||
return new_first_plans
|
||||
@@ -1,128 +0,0 @@
|
||||
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
# Modifications:
|
||||
# Copyright 2014 David Halter. Integration into Jedi.
|
||||
# Modifications are dual-licensed: MIT and PSF.
|
||||
|
||||
"""This module defines the data structures used to represent a grammar.
|
||||
|
||||
These are a bit arcane because they are derived from the data
|
||||
structures used by Python's 'pgen' parser generator.
|
||||
|
||||
There's also a table here mapping operators to their names in the
|
||||
token module; the Python tokenize module reports all operators as the
|
||||
fallback token code OP, but the parser needs the actual token code.
|
||||
|
||||
"""
|
||||
|
||||
try:
|
||||
import cPickle as pickle
|
||||
except:
|
||||
import pickle
|
||||
|
||||
|
||||
class Grammar(object):
|
||||
"""Pgen parsing tables conversion class.
|
||||
|
||||
Once initialized, this class supplies the grammar tables for the
|
||||
parsing engine implemented by parse.py. The parsing engine
|
||||
accesses the instance variables directly. The class here does not
|
||||
provide initialization of the tables; several subclasses exist to
|
||||
do this (see the conv and pgen modules).
|
||||
|
||||
The load() method reads the tables from a pickle file, which is
|
||||
much faster than the other ways offered by subclasses. The pickle
|
||||
file is written by calling dump() (after loading the grammar
|
||||
tables using a subclass). The report() method prints a readable
|
||||
representation of the tables to stdout, for debugging.
|
||||
|
||||
The instance variables are as follows:
|
||||
|
||||
symbol2number -- a dict mapping symbol names to numbers. Symbol
|
||||
numbers are always 256 or higher, to distinguish
|
||||
them from token numbers, which are between 0 and
|
||||
255 (inclusive).
|
||||
|
||||
number2symbol -- a dict mapping numbers to symbol names;
|
||||
these two are each other's inverse.
|
||||
|
||||
states -- a list of DFAs, where each DFA is a list of
|
||||
states, each state is a list of arcs, and each
|
||||
arc is a (i, j) pair where i is a label and j is
|
||||
a state number. The DFA number is the index into
|
||||
this list. (This name is slightly confusing.)
|
||||
Final states are represented by a special arc of
|
||||
the form (0, j) where j is its own state number.
|
||||
|
||||
dfas -- a dict mapping symbol numbers to (DFA, first)
|
||||
pairs, where DFA is an item from the states list
|
||||
above, and first is a set of tokens that can
|
||||
begin this grammar rule (represented by a dict
|
||||
whose values are always 1).
|
||||
|
||||
labels -- a list of (x, y) pairs where x is either a token
|
||||
number or a symbol number, and y is either None
|
||||
or a string; the strings are keywords. The label
|
||||
number is the index in this list; label numbers
|
||||
are used to mark state transitions (arcs) in the
|
||||
DFAs.
|
||||
|
||||
start -- the number of the grammar's start symbol.
|
||||
|
||||
keywords -- a dict mapping keyword strings to arc labels.
|
||||
|
||||
tokens -- a dict mapping token numbers to arc labels.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, bnf_text):
|
||||
self.symbol2number = {}
|
||||
self.number2symbol = {}
|
||||
self.states = []
|
||||
self.dfas = {}
|
||||
self.labels = [(0, "EMPTY")]
|
||||
self.keywords = {}
|
||||
self.tokens = {}
|
||||
self.symbol2label = {}
|
||||
self.label2symbol = {}
|
||||
self.start = 256
|
||||
|
||||
def dump(self, filename):
|
||||
"""Dump the grammar tables to a pickle file."""
|
||||
with open(filename, "wb") as f:
|
||||
pickle.dump(self.__dict__, f, 2)
|
||||
|
||||
def load(self, filename):
|
||||
"""Load the grammar tables from a pickle file."""
|
||||
with open(filename, "rb") as f:
|
||||
d = pickle.load(f)
|
||||
self.__dict__.update(d)
|
||||
|
||||
def copy(self):
|
||||
"""
|
||||
Copy the grammar.
|
||||
"""
|
||||
new = self.__class__()
|
||||
for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
|
||||
"tokens", "symbol2label"):
|
||||
setattr(new, dict_attr, getattr(self, dict_attr).copy())
|
||||
new.labels = self.labels[:]
|
||||
new.states = self.states[:]
|
||||
new.start = self.start
|
||||
return new
|
||||
|
||||
def report(self):
|
||||
"""Dump the grammar tables to standard output, for debugging."""
|
||||
from pprint import pprint
|
||||
print("s2n")
|
||||
pprint(self.symbol2number)
|
||||
print("n2s")
|
||||
pprint(self.number2symbol)
|
||||
print("states")
|
||||
pprint(self.states)
|
||||
print("dfas")
|
||||
pprint(self.dfas)
|
||||
print("labels")
|
||||
pprint(self.labels)
|
||||
print("start", self.start)
|
||||
156
parso/pgen2/grammar_parser.py
Normal file
156
parso/pgen2/grammar_parser.py
Normal file
@@ -0,0 +1,156 @@
|
||||
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
# Modifications:
|
||||
# Copyright David Halter and Contributors
|
||||
# Modifications are dual-licensed: MIT and PSF.
|
||||
|
||||
from parso.python.tokenize import tokenize
|
||||
from parso.utils import parse_version_string
|
||||
from parso.python.token import PythonTokenTypes
|
||||
|
||||
|
||||
class GrammarParser():
|
||||
"""
|
||||
The parser for Python grammar files.
|
||||
"""
|
||||
def __init__(self, bnf_grammar):
|
||||
self._bnf_grammar = bnf_grammar
|
||||
self.generator = tokenize(
|
||||
bnf_grammar,
|
||||
version_info=parse_version_string('3.6')
|
||||
)
|
||||
self._gettoken() # Initialize lookahead
|
||||
|
||||
def parse(self):
|
||||
# grammar: (NEWLINE | rule)* ENDMARKER
|
||||
while self.type != PythonTokenTypes.ENDMARKER:
|
||||
while self.type == PythonTokenTypes.NEWLINE:
|
||||
self._gettoken()
|
||||
|
||||
# rule: NAME ':' rhs NEWLINE
|
||||
self._current_rule_name = self._expect(PythonTokenTypes.NAME)
|
||||
self._expect(PythonTokenTypes.OP, ':')
|
||||
|
||||
a, z = self._parse_rhs()
|
||||
self._expect(PythonTokenTypes.NEWLINE)
|
||||
|
||||
yield a, z
|
||||
|
||||
def _parse_rhs(self):
|
||||
# rhs: items ('|' items)*
|
||||
a, z = self._parse_items()
|
||||
if self.value != "|":
|
||||
return a, z
|
||||
else:
|
||||
aa = NFAState(self._current_rule_name)
|
||||
zz = NFAState(self._current_rule_name)
|
||||
while True:
|
||||
# Add the possibility to go into the state of a and come back
|
||||
# to finish.
|
||||
aa.add_arc(a)
|
||||
z.add_arc(zz)
|
||||
if self.value != "|":
|
||||
break
|
||||
|
||||
self._gettoken()
|
||||
a, z = self._parse_items()
|
||||
return aa, zz
|
||||
|
||||
def _parse_items(self):
|
||||
# items: item+
|
||||
a, b = self._parse_item()
|
||||
while self.type in (PythonTokenTypes.NAME, PythonTokenTypes.STRING) \
|
||||
or self.value in ('(', '['):
|
||||
c, d = self._parse_item()
|
||||
# Need to end on the next item.
|
||||
b.add_arc(c)
|
||||
b = d
|
||||
return a, b
|
||||
|
||||
def _parse_item(self):
|
||||
# item: '[' rhs ']' | atom ['+' | '*']
|
||||
if self.value == "[":
|
||||
self._gettoken()
|
||||
a, z = self._parse_rhs()
|
||||
self._expect(PythonTokenTypes.OP, ']')
|
||||
# Make it also possible that there is no token and change the
|
||||
# state.
|
||||
a.add_arc(z)
|
||||
return a, z
|
||||
else:
|
||||
a, z = self._parse_atom()
|
||||
value = self.value
|
||||
if value not in ("+", "*"):
|
||||
return a, z
|
||||
self._gettoken()
|
||||
# Make it clear that we can go back to the old state and repeat.
|
||||
z.add_arc(a)
|
||||
if value == "+":
|
||||
return a, z
|
||||
else:
|
||||
# The end state is the same as the beginning, nothing must
|
||||
# change.
|
||||
return a, a
|
||||
|
||||
def _parse_atom(self):
|
||||
# atom: '(' rhs ')' | NAME | STRING
|
||||
if self.value == "(":
|
||||
self._gettoken()
|
||||
a, z = self._parse_rhs()
|
||||
self._expect(PythonTokenTypes.OP, ')')
|
||||
return a, z
|
||||
elif self.type in (PythonTokenTypes.NAME, PythonTokenTypes.STRING):
|
||||
a = NFAState(self._current_rule_name)
|
||||
z = NFAState(self._current_rule_name)
|
||||
# Make it clear that the state transition requires that value.
|
||||
a.add_arc(z, self.value)
|
||||
self._gettoken()
|
||||
return a, z
|
||||
else:
|
||||
self._raise_error("expected (...) or NAME or STRING, got %s/%s",
|
||||
self.type, self.value)
|
||||
|
||||
def _expect(self, type_, value=None):
|
||||
if self.type != type_:
|
||||
self._raise_error("expected %s, got %s [%s]",
|
||||
type_, self.type, self.value)
|
||||
if value is not None and self.value != value:
|
||||
self._raise_error("expected %s, got %s", value, self.value)
|
||||
value = self.value
|
||||
self._gettoken()
|
||||
return value
|
||||
|
||||
def _gettoken(self):
|
||||
tup = next(self.generator)
|
||||
self.type, self.value, self.begin, prefix = tup
|
||||
|
||||
def _raise_error(self, msg, *args):
|
||||
if args:
|
||||
try:
|
||||
msg = msg % args
|
||||
except:
|
||||
msg = " ".join([msg] + list(map(str, args)))
|
||||
line = self._bnf_grammar.splitlines()[self.begin[0] - 1]
|
||||
raise SyntaxError(msg, ('<grammar>', self.begin[0],
|
||||
self.begin[1], line))
|
||||
|
||||
|
||||
class NFAArc(object):
|
||||
def __init__(self, next_, nonterminal_or_string):
|
||||
self.next = next_
|
||||
self.nonterminal_or_string = nonterminal_or_string
|
||||
|
||||
|
||||
class NFAState(object):
|
||||
def __init__(self, from_rule):
|
||||
self.from_rule = from_rule
|
||||
self.arcs = [] # List[nonterminal (str), NFAState]
|
||||
|
||||
def add_arc(self, next_, nonterminal_or_string=None):
|
||||
assert nonterminal_or_string is None or isinstance(nonterminal_or_string, str)
|
||||
assert isinstance(next_, NFAState)
|
||||
self.arcs.append(NFAArc(next_, nonterminal_or_string))
|
||||
|
||||
def __repr__(self):
|
||||
return '<%s: from %s>' % (self.__class__.__name__, self.from_rule)
|
||||
@@ -1,223 +0,0 @@
|
||||
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
# Modifications:
|
||||
# Copyright 2014 David Halter. Integration into Jedi.
|
||||
# Modifications are dual-licensed: MIT and PSF.
|
||||
|
||||
"""
|
||||
Parser engine for the grammar tables generated by pgen.
|
||||
|
||||
The grammar table must be loaded first.
|
||||
|
||||
See Parser/parser.c in the Python distribution for additional info on
|
||||
how this parsing engine works.
|
||||
"""
|
||||
|
||||
from parso.python import tokenize
|
||||
|
||||
|
||||
class InternalParseError(Exception):
|
||||
"""
|
||||
Exception to signal the parser is stuck and error recovery didn't help.
|
||||
Basically this shouldn't happen. It's a sign that something is really
|
||||
wrong.
|
||||
"""
|
||||
|
||||
def __init__(self, msg, type, value, start_pos):
|
||||
Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
|
||||
(msg, tokenize.tok_name[type], value, start_pos))
|
||||
self.msg = msg
|
||||
self.type = type
|
||||
self.value = value
|
||||
self.start_pos = start_pos
|
||||
|
||||
|
||||
class Stack(list):
|
||||
def get_tos_nodes(self):
|
||||
tos = self[-1]
|
||||
return tos[2][1]
|
||||
|
||||
|
||||
def token_to_ilabel(grammar, type_, value):
|
||||
# Map from token to label
|
||||
if type_ == tokenize.NAME:
|
||||
# Check for reserved words (keywords)
|
||||
try:
|
||||
return grammar.keywords[value]
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
try:
|
||||
return grammar.tokens[type_]
|
||||
except KeyError:
|
||||
return None
|
||||
|
||||
|
||||
class PgenParser(object):
|
||||
"""Parser engine.
|
||||
|
||||
The proper usage sequence is:
|
||||
|
||||
p = Parser(grammar, [converter]) # create instance
|
||||
p.setup([start]) # prepare for parsing
|
||||
<for each input token>:
|
||||
if p.add_token(...): # parse a token
|
||||
break
|
||||
root = p.rootnode # root of abstract syntax tree
|
||||
|
||||
A Parser instance may be reused by calling setup() repeatedly.
|
||||
|
||||
A Parser instance contains state pertaining to the current token
|
||||
sequence, and should not be used concurrently by different threads
|
||||
to parse separate token sequences.
|
||||
|
||||
See driver.py for how to get input tokens by tokenizing a file or
|
||||
string.
|
||||
|
||||
Parsing is complete when add_token() returns True; the root of the
|
||||
abstract syntax tree can then be retrieved from the rootnode
|
||||
instance variable. When a syntax error occurs, error_recovery()
|
||||
is called. There is no error recovery; the parser cannot be used
|
||||
after a syntax error was reported (but it can be reinitialized by
|
||||
calling setup()).
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, grammar, convert_node, convert_leaf, error_recovery, start):
|
||||
"""Constructor.
|
||||
|
||||
The grammar argument is a grammar.Grammar instance; see the
|
||||
grammar module for more information.
|
||||
|
||||
The parser is not ready yet for parsing; you must call the
|
||||
setup() method to get it started.
|
||||
|
||||
The optional convert argument is a function mapping concrete
|
||||
syntax tree nodes to abstract syntax tree nodes. If not
|
||||
given, no conversion is done and the syntax tree produced is
|
||||
the concrete syntax tree. If given, it must be a function of
|
||||
two arguments, the first being the grammar (a grammar.Grammar
|
||||
instance), and the second being the concrete syntax tree node
|
||||
to be converted. The syntax tree is converted from the bottom
|
||||
up.
|
||||
|
||||
A concrete syntax tree node is a (type, nodes) tuple, where
|
||||
type is the node type (a token or symbol number) and nodes
|
||||
is a list of children for symbols, and None for tokens.
|
||||
|
||||
An abstract syntax tree node may be anything; this is entirely
|
||||
up to the converter function.
|
||||
|
||||
"""
|
||||
self.grammar = grammar
|
||||
self.convert_node = convert_node
|
||||
self.convert_leaf = convert_leaf
|
||||
|
||||
# Each stack entry is a tuple: (dfa, state, node).
|
||||
# A node is a tuple: (type, children),
|
||||
# where children is a list of nodes or None
|
||||
newnode = (start, [])
|
||||
stackentry = (self.grammar.dfas[start], 0, newnode)
|
||||
self.stack = Stack([stackentry])
|
||||
self.rootnode = None
|
||||
self.error_recovery = error_recovery
|
||||
|
||||
def parse(self, tokens):
|
||||
for type_, value, start_pos, prefix in tokens:
|
||||
if self.add_token(type_, value, start_pos, prefix):
|
||||
break
|
||||
else:
|
||||
# We never broke out -- EOF is too soon -- Unfinished statement.
|
||||
# However, the error recovery might have added the token again, if
|
||||
# the stack is empty, we're fine.
|
||||
if self.stack:
|
||||
raise InternalParseError("incomplete input", type_, value, start_pos)
|
||||
return self.rootnode
|
||||
|
||||
def add_token(self, type_, value, start_pos, prefix):
|
||||
"""Add a token; return True if this is the end of the program."""
|
||||
ilabel = token_to_ilabel(self.grammar, type_, value)
|
||||
|
||||
# Loop until the token is shifted; may raise exceptions
|
||||
_gram = self.grammar
|
||||
_labels = _gram.labels
|
||||
_push = self._push
|
||||
_pop = self._pop
|
||||
_shift = self._shift
|
||||
while True:
|
||||
dfa, state, node = self.stack[-1]
|
||||
states, first = dfa
|
||||
arcs = states[state]
|
||||
# Look for a state with this label
|
||||
for i, newstate in arcs:
|
||||
t, v = _labels[i]
|
||||
if ilabel == i:
|
||||
# Look it up in the list of labels
|
||||
assert t < 256
|
||||
# Shift a token; we're done with it
|
||||
_shift(type_, value, newstate, prefix, start_pos)
|
||||
# Pop while we are in an accept-only state
|
||||
state = newstate
|
||||
while states[state] == [(0, state)]:
|
||||
_pop()
|
||||
if not self.stack:
|
||||
# Done parsing!
|
||||
return True
|
||||
dfa, state, node = self.stack[-1]
|
||||
states, first = dfa
|
||||
# Done with this token
|
||||
return False
|
||||
elif t >= 256:
|
||||
# See if it's a symbol and if we're in its first set
|
||||
itsdfa = _gram.dfas[t]
|
||||
itsstates, itsfirst = itsdfa
|
||||
if ilabel in itsfirst:
|
||||
# Push a symbol
|
||||
_push(t, itsdfa, newstate)
|
||||
break # To continue the outer while loop
|
||||
else:
|
||||
if (0, state) in arcs:
|
||||
# An accepting state, pop it and try something else
|
||||
_pop()
|
||||
if not self.stack:
|
||||
# Done parsing, but another token is input
|
||||
raise InternalParseError("too much input", type_, value, start_pos)
|
||||
else:
|
||||
self.error_recovery(self.grammar, self.stack, arcs, type_,
|
||||
value, start_pos, prefix, self.add_token)
|
||||
break
|
||||
|
||||
def _shift(self, type_, value, newstate, prefix, start_pos):
|
||||
"""Shift a token. (Internal)"""
|
||||
dfa, state, node = self.stack[-1]
|
||||
newnode = self.convert_leaf(self.grammar, type_, value, prefix, start_pos)
|
||||
node[-1].append(newnode)
|
||||
self.stack[-1] = (dfa, newstate, node)
|
||||
|
||||
def _push(self, type_, newdfa, newstate):
|
||||
"""Push a nonterminal. (Internal)"""
|
||||
dfa, state, node = self.stack[-1]
|
||||
newnode = (type_, [])
|
||||
self.stack[-1] = (dfa, newstate, node)
|
||||
self.stack.append((newdfa, 0, newnode))
|
||||
|
||||
def _pop(self):
|
||||
"""Pop a nonterminal. (Internal)"""
|
||||
popdfa, popstate, (type_, children) = self.stack.pop()
|
||||
# If there's exactly one child, return that child instead of creating a
|
||||
# new node. We still create expr_stmt and file_input though, because a
|
||||
# lot of Jedi depends on its logic.
|
||||
if len(children) == 1:
|
||||
newnode = children[0]
|
||||
else:
|
||||
newnode = self.convert_node(self.grammar, type_, children)
|
||||
|
||||
try:
|
||||
# Equal to:
|
||||
# dfa, state, node = self.stack[-1]
|
||||
# symbol, children = node
|
||||
self.stack[-1][2][1].append(newnode)
|
||||
except IndexError:
|
||||
# Stack is empty, set the rootnode.
|
||||
self.rootnode = newnode
|
||||
@@ -1,400 +0,0 @@
|
||||
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
# Modifications:
|
||||
# Copyright 2014 David Halter. Integration into Jedi.
|
||||
# Modifications are dual-licensed: MIT and PSF.
|
||||
|
||||
from parso.pgen2 import grammar
|
||||
from parso.python import token
|
||||
from parso.python import tokenize
|
||||
from parso.utils import parse_version_string
|
||||
|
||||
|
||||
class ParserGenerator(object):
|
||||
def __init__(self, bnf_text, token_namespace):
|
||||
self._bnf_text = bnf_text
|
||||
self.generator = tokenize.tokenize(
|
||||
bnf_text,
|
||||
version_info=parse_version_string('3.6')
|
||||
)
|
||||
self._gettoken() # Initialize lookahead
|
||||
self.dfas, self.startsymbol = self._parse()
|
||||
self.first = {} # map from symbol name to set of tokens
|
||||
self._addfirstsets()
|
||||
self._token_namespace = token_namespace
|
||||
|
||||
def make_grammar(self):
|
||||
c = grammar.Grammar(self._bnf_text)
|
||||
names = list(self.dfas.keys())
|
||||
names.sort()
|
||||
# TODO do we still need this?
|
||||
names.remove(self.startsymbol)
|
||||
names.insert(0, self.startsymbol)
|
||||
for name in names:
|
||||
i = 256 + len(c.symbol2number)
|
||||
c.symbol2number[name] = i
|
||||
c.number2symbol[i] = name
|
||||
for name in names:
|
||||
dfa = self.dfas[name]
|
||||
states = []
|
||||
for state in dfa:
|
||||
arcs = []
|
||||
for label, next in state.arcs.items():
|
||||
arcs.append((self._make_label(c, label), dfa.index(next)))
|
||||
if state.isfinal:
|
||||
arcs.append((0, dfa.index(state)))
|
||||
states.append(arcs)
|
||||
c.states.append(states)
|
||||
c.dfas[c.symbol2number[name]] = (states, self._make_first(c, name))
|
||||
c.start = c.symbol2number[self.startsymbol]
|
||||
return c
|
||||
|
||||
def _make_first(self, c, name):
|
||||
rawfirst = self.first[name]
|
||||
first = {}
|
||||
for label in rawfirst:
|
||||
ilabel = self._make_label(c, label)
|
||||
##assert ilabel not in first # XXX failed on <> ... !=
|
||||
first[ilabel] = 1
|
||||
return first
|
||||
|
||||
def _make_label(self, c, label):
|
||||
# XXX Maybe this should be a method on a subclass of converter?
|
||||
ilabel = len(c.labels)
|
||||
if label[0].isalpha():
|
||||
# Either a symbol name or a named token
|
||||
if label in c.symbol2number:
|
||||
# A symbol name (a non-terminal)
|
||||
if label in c.symbol2label:
|
||||
return c.symbol2label[label]
|
||||
else:
|
||||
c.labels.append((c.symbol2number[label], None))
|
||||
c.symbol2label[label] = ilabel
|
||||
c.label2symbol[ilabel] = label
|
||||
return ilabel
|
||||
else:
|
||||
# A named token (NAME, NUMBER, STRING)
|
||||
itoken = getattr(self._token_namespace, label, None)
|
||||
assert isinstance(itoken, int), label
|
||||
if itoken in c.tokens:
|
||||
return c.tokens[itoken]
|
||||
else:
|
||||
c.labels.append((itoken, None))
|
||||
c.tokens[itoken] = ilabel
|
||||
return ilabel
|
||||
else:
|
||||
# Either a keyword or an operator
|
||||
assert label[0] in ('"', "'"), label
|
||||
value = eval(label)
|
||||
if value[0].isalpha():
|
||||
# A keyword
|
||||
if value in c.keywords:
|
||||
return c.keywords[value]
|
||||
else:
|
||||
# TODO this might be an issue?! Using token.NAME here?
|
||||
c.labels.append((token.NAME, value))
|
||||
c.keywords[value] = ilabel
|
||||
return ilabel
|
||||
else:
|
||||
# An operator (any non-numeric token)
|
||||
itoken = self._token_namespace.generate_token_id(value)
|
||||
if itoken in c.tokens:
|
||||
return c.tokens[itoken]
|
||||
else:
|
||||
c.labels.append((itoken, None))
|
||||
c.tokens[itoken] = ilabel
|
||||
return ilabel
|
||||
|
||||
def _addfirstsets(self):
|
||||
names = list(self.dfas.keys())
|
||||
names.sort()
|
||||
for name in names:
|
||||
if name not in self.first:
|
||||
self._calcfirst(name)
|
||||
#print name, self.first[name].keys()
|
||||
|
||||
def _calcfirst(self, name):
|
||||
dfa = self.dfas[name]
|
||||
self.first[name] = None # dummy to detect left recursion
|
||||
state = dfa[0]
|
||||
totalset = {}
|
||||
overlapcheck = {}
|
||||
for label, next in state.arcs.items():
|
||||
if label in self.dfas:
|
||||
if label in self.first:
|
||||
fset = self.first[label]
|
||||
if fset is None:
|
||||
raise ValueError("recursion for rule %r" % name)
|
||||
else:
|
||||
self._calcfirst(label)
|
||||
fset = self.first[label]
|
||||
totalset.update(fset)
|
||||
overlapcheck[label] = fset
|
||||
else:
|
||||
totalset[label] = 1
|
||||
overlapcheck[label] = {label: 1}
|
||||
inverse = {}
|
||||
for label, itsfirst in overlapcheck.items():
|
||||
for symbol in itsfirst:
|
||||
if symbol in inverse:
|
||||
raise ValueError("rule %s is ambiguous; %s is in the"
|
||||
" first sets of %s as well as %s" %
|
||||
(name, symbol, label, inverse[symbol]))
|
||||
inverse[symbol] = label
|
||||
self.first[name] = totalset
|
||||
|
||||
def _parse(self):
|
||||
dfas = {}
|
||||
startsymbol = None
|
||||
# MSTART: (NEWLINE | RULE)* ENDMARKER
|
||||
while self.type != token.ENDMARKER:
|
||||
while self.type == token.NEWLINE:
|
||||
self._gettoken()
|
||||
# RULE: NAME ':' RHS NEWLINE
|
||||
name = self._expect(token.NAME)
|
||||
self._expect(token.COLON)
|
||||
a, z = self._parse_rhs()
|
||||
self._expect(token.NEWLINE)
|
||||
#self._dump_nfa(name, a, z)
|
||||
dfa = self._make_dfa(a, z)
|
||||
#self._dump_dfa(name, dfa)
|
||||
# oldlen = len(dfa)
|
||||
self._simplify_dfa(dfa)
|
||||
# newlen = len(dfa)
|
||||
dfas[name] = dfa
|
||||
#print name, oldlen, newlen
|
||||
if startsymbol is None:
|
||||
startsymbol = name
|
||||
return dfas, startsymbol
|
||||
|
||||
def _make_dfa(self, start, finish):
|
||||
# To turn an NFA into a DFA, we define the states of the DFA
|
||||
# to correspond to *sets* of states of the NFA. Then do some
|
||||
# state reduction. Let's represent sets as dicts with 1 for
|
||||
# values.
|
||||
assert isinstance(start, NFAState)
|
||||
assert isinstance(finish, NFAState)
|
||||
|
||||
def closure(state):
|
||||
base = {}
|
||||
addclosure(state, base)
|
||||
return base
|
||||
|
||||
def addclosure(state, base):
|
||||
assert isinstance(state, NFAState)
|
||||
if state in base:
|
||||
return
|
||||
base[state] = 1
|
||||
for label, next in state.arcs:
|
||||
if label is None:
|
||||
addclosure(next, base)
|
||||
|
||||
states = [DFAState(closure(start), finish)]
|
||||
for state in states: # NB states grows while we're iterating
|
||||
arcs = {}
|
||||
for nfastate in state.nfaset:
|
||||
for label, next in nfastate.arcs:
|
||||
if label is not None:
|
||||
addclosure(next, arcs.setdefault(label, {}))
|
||||
for label, nfaset in arcs.items():
|
||||
for st in states:
|
||||
if st.nfaset == nfaset:
|
||||
break
|
||||
else:
|
||||
st = DFAState(nfaset, finish)
|
||||
states.append(st)
|
||||
state.addarc(st, label)
|
||||
return states # List of DFAState instances; first one is start
|
||||
|
||||
def _dump_nfa(self, name, start, finish):
|
||||
print("Dump of NFA for", name)
|
||||
todo = [start]
|
||||
for i, state in enumerate(todo):
|
||||
print(" State", i, state is finish and "(final)" or "")
|
||||
for label, next in state.arcs:
|
||||
if next in todo:
|
||||
j = todo.index(next)
|
||||
else:
|
||||
j = len(todo)
|
||||
todo.append(next)
|
||||
if label is None:
|
||||
print(" -> %d" % j)
|
||||
else:
|
||||
print(" %s -> %d" % (label, j))
|
||||
|
||||
def _dump_dfa(self, name, dfa):
|
||||
print("Dump of DFA for", name)
|
||||
for i, state in enumerate(dfa):
|
||||
print(" State", i, state.isfinal and "(final)" or "")
|
||||
for label, next in state.arcs.items():
|
||||
print(" %s -> %d" % (label, dfa.index(next)))
|
||||
|
||||
def _simplify_dfa(self, dfa):
|
||||
# This is not theoretically optimal, but works well enough.
|
||||
# Algorithm: repeatedly look for two states that have the same
|
||||
# set of arcs (same labels pointing to the same nodes) and
|
||||
# unify them, until things stop changing.
|
||||
|
||||
# dfa is a list of DFAState instances
|
||||
changes = True
|
||||
while changes:
|
||||
changes = False
|
||||
for i, state_i in enumerate(dfa):
|
||||
for j in range(i + 1, len(dfa)):
|
||||
state_j = dfa[j]
|
||||
if state_i == state_j:
|
||||
#print " unify", i, j
|
||||
del dfa[j]
|
||||
for state in dfa:
|
||||
state.unifystate(state_j, state_i)
|
||||
changes = True
|
||||
break
|
||||
|
||||
def _parse_rhs(self):
|
||||
# RHS: ALT ('|' ALT)*
|
||||
a, z = self._parse_alt()
|
||||
if self.value != "|":
|
||||
return a, z
|
||||
else:
|
||||
aa = NFAState()
|
||||
zz = NFAState()
|
||||
aa.addarc(a)
|
||||
z.addarc(zz)
|
||||
while self.value == "|":
|
||||
self._gettoken()
|
||||
a, z = self._parse_alt()
|
||||
aa.addarc(a)
|
||||
z.addarc(zz)
|
||||
return aa, zz
|
||||
|
||||
def _parse_alt(self):
|
||||
# ALT: ITEM+
|
||||
a, b = self._parse_item()
|
||||
while (self.value in ("(", "[") or
|
||||
self.type in (token.NAME, token.STRING)):
|
||||
c, d = self._parse_item()
|
||||
b.addarc(c)
|
||||
b = d
|
||||
return a, b
|
||||
|
||||
def _parse_item(self):
|
||||
# ITEM: '[' RHS ']' | ATOM ['+' | '*']
|
||||
if self.value == "[":
|
||||
self._gettoken()
|
||||
a, z = self._parse_rhs()
|
||||
self._expect(token.RSQB)
|
||||
a.addarc(z)
|
||||
return a, z
|
||||
else:
|
||||
a, z = self._parse_atom()
|
||||
value = self.value
|
||||
if value not in ("+", "*"):
|
||||
return a, z
|
||||
self._gettoken()
|
||||
z.addarc(a)
|
||||
if value == "+":
|
||||
return a, z
|
||||
else:
|
||||
return a, a
|
||||
|
||||
def _parse_atom(self):
|
||||
# ATOM: '(' RHS ')' | NAME | STRING
|
||||
if self.value == "(":
|
||||
self._gettoken()
|
||||
a, z = self._parse_rhs()
|
||||
self._expect(token.RPAR)
|
||||
return a, z
|
||||
elif self.type in (token.NAME, token.STRING):
|
||||
a = NFAState()
|
||||
z = NFAState()
|
||||
a.addarc(z, self.value)
|
||||
self._gettoken()
|
||||
return a, z
|
||||
else:
|
||||
self._raise_error("expected (...) or NAME or STRING, got %s/%s",
|
||||
self.type, self.value)
|
||||
|
||||
def _expect(self, type):
|
||||
if self.type != type:
|
||||
self._raise_error("expected %s(%s), got %s(%s)",
|
||||
type, token.tok_name[type], self.type, self.value)
|
||||
value = self.value
|
||||
self._gettoken()
|
||||
return value
|
||||
|
||||
def _gettoken(self):
|
||||
tup = next(self.generator)
|
||||
while tup[0] in (token.COMMENT, token.NL):
|
||||
tup = next(self.generator)
|
||||
self.type, self.value, self.begin, prefix = tup
|
||||
|
||||
def _raise_error(self, msg, *args):
|
||||
if args:
|
||||
try:
|
||||
msg = msg % args
|
||||
except:
|
||||
msg = " ".join([msg] + list(map(str, args)))
|
||||
line = self._bnf_text.splitlines()[self.begin[0] - 1]
|
||||
raise SyntaxError(msg, ('<grammar>', self.begin[0],
|
||||
self.begin[1], line))
|
||||
|
||||
|
||||
class NFAState(object):
|
||||
def __init__(self):
|
||||
self.arcs = [] # list of (label, NFAState) pairs
|
||||
|
||||
def addarc(self, next, label=None):
|
||||
assert label is None or isinstance(label, str)
|
||||
assert isinstance(next, NFAState)
|
||||
self.arcs.append((label, next))
|
||||
|
||||
|
||||
class DFAState(object):
|
||||
def __init__(self, nfaset, final):
|
||||
assert isinstance(nfaset, dict)
|
||||
assert isinstance(next(iter(nfaset)), NFAState)
|
||||
assert isinstance(final, NFAState)
|
||||
self.nfaset = nfaset
|
||||
self.isfinal = final in nfaset
|
||||
self.arcs = {} # map from label to DFAState
|
||||
|
||||
def addarc(self, next, label):
|
||||
assert isinstance(label, str)
|
||||
assert label not in self.arcs
|
||||
assert isinstance(next, DFAState)
|
||||
self.arcs[label] = next
|
||||
|
||||
def unifystate(self, old, new):
|
||||
for label, next in self.arcs.items():
|
||||
if next is old:
|
||||
self.arcs[label] = new
|
||||
|
||||
def __eq__(self, other):
|
||||
# Equality test -- ignore the nfaset instance variable
|
||||
assert isinstance(other, DFAState)
|
||||
if self.isfinal != other.isfinal:
|
||||
return False
|
||||
# Can't just return self.arcs == other.arcs, because that
|
||||
# would invoke this method recursively, with cycles...
|
||||
if len(self.arcs) != len(other.arcs):
|
||||
return False
|
||||
for label, next in self.arcs.items():
|
||||
if next is not other.arcs.get(label):
|
||||
return False
|
||||
return True
|
||||
|
||||
__hash__ = None # For Py3 compatibility.
|
||||
|
||||
|
||||
def generate_grammar(bnf_text, token_namespace):
|
||||
"""
|
||||
``bnf_text`` is a grammar in extended BNF (using * for repetition, + for
|
||||
at-least-once repetition, [] for optional parts, | for alternatives and ()
|
||||
for grouping).
|
||||
|
||||
It's not EBNF according to ISO/IEC 14977. It's a dialect Python uses in its
|
||||
own parser.
|
||||
"""
|
||||
p = ParserGenerator(bnf_text, token_namespace)
|
||||
return p.make_grammar()
|
||||
@@ -13,8 +13,8 @@ import logging
|
||||
from parso.utils import split_lines
|
||||
from parso.python.parser import Parser
|
||||
from parso.python.tree import EndMarker
|
||||
from parso.python.tokenize import (NEWLINE, PythonToken, ERROR_DEDENT,
|
||||
ENDMARKER, INDENT, DEDENT)
|
||||
from parso.python.tokenize import PythonToken
|
||||
from parso.python.token import PythonTokenTypes
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
@@ -29,7 +29,7 @@ def _get_last_line(node_or_leaf):
|
||||
|
||||
def _ends_with_newline(leaf, suffix=''):
|
||||
if leaf.type == 'error_leaf':
|
||||
typ = leaf.original_type
|
||||
typ = leaf.token_type.lower()
|
||||
else:
|
||||
typ = leaf.type
|
||||
|
||||
@@ -41,9 +41,8 @@ def _flows_finished(pgen_grammar, stack):
|
||||
if, while, for and try might not be finished, because another part might
|
||||
still be parsed.
|
||||
"""
|
||||
for dfa, newstate, (symbol_number, nodes) in stack:
|
||||
if pgen_grammar.number2symbol[symbol_number] in ('if_stmt', 'while_stmt',
|
||||
'for_stmt', 'try_stmt'):
|
||||
for stack_node in stack:
|
||||
if stack_node.nonterminal in ('if_stmt', 'while_stmt', 'for_stmt', 'try_stmt'):
|
||||
return False
|
||||
return True
|
||||
|
||||
@@ -52,10 +51,10 @@ def suite_or_file_input_is_valid(pgen_grammar, stack):
|
||||
if not _flows_finished(pgen_grammar, stack):
|
||||
return False
|
||||
|
||||
for dfa, newstate, (symbol_number, nodes) in reversed(stack):
|
||||
if pgen_grammar.number2symbol[symbol_number] == 'suite':
|
||||
for stack_node in reversed(stack):
|
||||
if stack_node.nonterminal == 'suite':
|
||||
# If only newline is in the suite, the suite is not valid, yet.
|
||||
return len(nodes) > 1
|
||||
return len(stack_node.nodes) > 1
|
||||
# Not reaching a suite means that we're dealing with file_input levels
|
||||
# where there's no need for a valid statement in it. It can also be empty.
|
||||
return True
|
||||
@@ -168,8 +167,7 @@ class DiffParser(object):
|
||||
|
||||
def _enabled_debugging(self, old_lines, lines_new):
|
||||
if self._module.get_code() != ''.join(lines_new):
|
||||
LOG.warning('parser issue:\n%s\n%s', ''.join(old_lines),
|
||||
''.join(lines_new))
|
||||
LOG.warning('parser issue:\n%s\n%s', ''.join(old_lines), ''.join(lines_new))
|
||||
|
||||
def _copy_from_old_parser(self, line_offset, until_line_old, until_line_new):
|
||||
copied_nodes = [None]
|
||||
@@ -273,7 +271,6 @@ class DiffParser(object):
|
||||
# memoryview?
|
||||
parsed_until_line = self._nodes_stack.parsed_until_line
|
||||
lines_after = self._parser_lines_new[parsed_until_line:]
|
||||
#print('parse_content', parsed_until_line, lines_after, until_line)
|
||||
tokens = self._diff_tokenize(
|
||||
lines_after,
|
||||
until_line,
|
||||
@@ -290,10 +287,10 @@ class DiffParser(object):
|
||||
omitted_first_indent = False
|
||||
indents = []
|
||||
tokens = self._tokenizer(lines, (1, 0))
|
||||
stack = self._active_parser.pgen_parser.stack
|
||||
stack = self._active_parser.stack
|
||||
for typ, string, start_pos, prefix in tokens:
|
||||
start_pos = start_pos[0] + line_offset, start_pos[1]
|
||||
if typ == INDENT:
|
||||
if typ == PythonTokenTypes.INDENT:
|
||||
indents.append(start_pos[1])
|
||||
if is_first_token:
|
||||
omitted_first_indent = True
|
||||
@@ -306,8 +303,9 @@ class DiffParser(object):
|
||||
|
||||
# In case of omitted_first_indent, it might not be dedented fully.
|
||||
# However this is a sign for us that a dedent happened.
|
||||
if typ == DEDENT \
|
||||
or typ == ERROR_DEDENT and omitted_first_indent and len(indents) == 1:
|
||||
if typ == PythonTokenTypes.DEDENT \
|
||||
or typ == PythonTokenTypes.ERROR_DEDENT \
|
||||
and omitted_first_indent and len(indents) == 1:
|
||||
indents.pop()
|
||||
if omitted_first_indent and not indents:
|
||||
# We are done here, only thing that can come now is an
|
||||
@@ -317,18 +315,22 @@ class DiffParser(object):
|
||||
prefix = re.sub(r'(<=\n)[^\n]+$', '', prefix)
|
||||
else:
|
||||
prefix = ''
|
||||
yield PythonToken(ENDMARKER, '', (start_pos[0] + line_offset, 0), prefix)
|
||||
yield PythonToken(
|
||||
PythonTokenTypes.ENDMARKER, '',
|
||||
(start_pos[0] + line_offset, 0),
|
||||
prefix
|
||||
)
|
||||
break
|
||||
elif typ == NEWLINE and start_pos[0] >= until_line:
|
||||
elif typ == PythonTokenTypes.NEWLINE and start_pos[0] >= until_line:
|
||||
yield PythonToken(typ, string, start_pos, prefix)
|
||||
# Check if the parser is actually in a valid suite state.
|
||||
if suite_or_file_input_is_valid(self._pgen_grammar, stack):
|
||||
start_pos = start_pos[0] + 1, 0
|
||||
while len(indents) > int(omitted_first_indent):
|
||||
indents.pop()
|
||||
yield PythonToken(DEDENT, '', start_pos, '')
|
||||
yield PythonToken(PythonTokenTypes.DEDENT, '', start_pos, '')
|
||||
|
||||
yield PythonToken(ENDMARKER, '', start_pos, '')
|
||||
yield PythonToken(PythonTokenTypes.ENDMARKER, '', start_pos, '')
|
||||
break
|
||||
else:
|
||||
continue
|
||||
@@ -490,6 +492,9 @@ class _NodesStack(object):
|
||||
|
||||
new_tos = tos
|
||||
for node in nodes:
|
||||
if node.start_pos[0] > until_line:
|
||||
break
|
||||
|
||||
if node.type == 'endmarker':
|
||||
# We basically removed the endmarker, but we are not allowed to
|
||||
# remove the newline at the end of the line, otherwise it's
|
||||
@@ -501,8 +506,6 @@ class _NodesStack(object):
|
||||
# Endmarkers just distort all the checks below. Remove them.
|
||||
break
|
||||
|
||||
if node.start_pos[0] > until_line:
|
||||
break
|
||||
# TODO this check might take a bit of time for large files. We
|
||||
# might want to change this to do more intelligent guessing or
|
||||
# binary search.
|
||||
@@ -536,7 +539,7 @@ class _NodesStack(object):
|
||||
line_offset_index = -2
|
||||
|
||||
elif (new_nodes[-1].type in ('error_leaf', 'error_node') or
|
||||
_is_flow_node(new_nodes[-1])):
|
||||
_is_flow_node(new_nodes[-1])):
|
||||
# Error leafs/nodes don't have a defined start/end. Error
|
||||
# nodes might not end with a newline (e.g. if there's an
|
||||
# open `(`). Therefore ignore all of them unless they are
|
||||
|
||||
@@ -306,12 +306,12 @@ class ErrorFinder(Normalizer):
|
||||
|
||||
def visit_leaf(self, leaf):
|
||||
if leaf.type == 'error_leaf':
|
||||
if leaf.original_type in ('indent', 'error_dedent'):
|
||||
if leaf.token_type in ('INDENT', 'ERROR_DEDENT'):
|
||||
# Indents/Dedents itself never have a prefix. They are just
|
||||
# "pseudo" tokens that get removed by the syntax tree later.
|
||||
# Therefore in case of an error we also have to check for this.
|
||||
spacing = list(leaf.get_next_leaf()._split_prefix())[-1]
|
||||
if leaf.original_type == 'indent':
|
||||
if leaf.token_type == 'INDENT':
|
||||
message = 'unexpected indent'
|
||||
else:
|
||||
message = 'unindent does not match any outer indentation level'
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
from parso.python import tree
|
||||
from parso.python.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER,
|
||||
STRING, tok_name, NAME, FSTRING_STRING,
|
||||
FSTRING_START, FSTRING_END)
|
||||
from parso.python.token import PythonTokenTypes
|
||||
from parso.parser import BaseParser
|
||||
from parso.pgen2.parse import token_to_ilabel
|
||||
|
||||
|
||||
NAME = PythonTokenTypes.NAME
|
||||
INDENT = PythonTokenTypes.INDENT
|
||||
DEDENT = PythonTokenTypes.DEDENT
|
||||
|
||||
|
||||
class Parser(BaseParser):
|
||||
@@ -53,53 +55,33 @@ class Parser(BaseParser):
|
||||
|
||||
# Names/Keywords are handled separately
|
||||
_leaf_map = {
|
||||
STRING: tree.String,
|
||||
NUMBER: tree.Number,
|
||||
NEWLINE: tree.Newline,
|
||||
ENDMARKER: tree.EndMarker,
|
||||
FSTRING_STRING: tree.FStringString,
|
||||
FSTRING_START: tree.FStringStart,
|
||||
FSTRING_END: tree.FStringEnd,
|
||||
PythonTokenTypes.STRING: tree.String,
|
||||
PythonTokenTypes.NUMBER: tree.Number,
|
||||
PythonTokenTypes.NEWLINE: tree.Newline,
|
||||
PythonTokenTypes.ENDMARKER: tree.EndMarker,
|
||||
PythonTokenTypes.FSTRING_STRING: tree.FStringString,
|
||||
PythonTokenTypes.FSTRING_START: tree.FStringStart,
|
||||
PythonTokenTypes.FSTRING_END: tree.FStringEnd,
|
||||
}
|
||||
|
||||
def __init__(self, pgen_grammar, error_recovery=True, start_symbol='file_input'):
|
||||
super(Parser, self).__init__(pgen_grammar, start_symbol, error_recovery=error_recovery)
|
||||
def __init__(self, pgen_grammar, error_recovery=True, start_nonterminal='file_input'):
|
||||
super(Parser, self).__init__(pgen_grammar, start_nonterminal,
|
||||
error_recovery=error_recovery)
|
||||
|
||||
self.syntax_errors = []
|
||||
self._omit_dedent_list = []
|
||||
self._indent_counter = 0
|
||||
|
||||
# TODO do print absolute import detection here.
|
||||
# try:
|
||||
# del python_grammar_no_print_statement.keywords["print"]
|
||||
# except KeyError:
|
||||
# pass # Doesn't exist in the Python 3 grammar.
|
||||
|
||||
# if self.options["print_function"]:
|
||||
# python_grammar = pygram.python_grammar_no_print_statement
|
||||
# else:
|
||||
|
||||
def parse(self, tokens):
|
||||
if self._error_recovery:
|
||||
if self._start_symbol != 'file_input':
|
||||
if self._start_nonterminal != 'file_input':
|
||||
raise NotImplementedError
|
||||
|
||||
tokens = self._recovery_tokenize(tokens)
|
||||
|
||||
node = super(Parser, self).parse(tokens)
|
||||
return super(Parser, self).parse(tokens)
|
||||
|
||||
if self._start_symbol == 'file_input' != node.type:
|
||||
# If there's only one statement, we get back a non-module. That's
|
||||
# not what we want, we want a module, so we add it here:
|
||||
node = self.convert_node(
|
||||
self._pgen_grammar,
|
||||
self._pgen_grammar.symbol2number['file_input'],
|
||||
[node]
|
||||
)
|
||||
|
||||
return node
|
||||
|
||||
def convert_node(self, pgen_grammar, type, children):
|
||||
def convert_node(self, nonterminal, children):
|
||||
"""
|
||||
Convert raw node information to a PythonBaseNode instance.
|
||||
|
||||
@@ -107,149 +89,114 @@ class Parser(BaseParser):
|
||||
grammar rule produces a new complete node, so that the tree is build
|
||||
strictly bottom-up.
|
||||
"""
|
||||
# TODO REMOVE symbol, we don't want type here.
|
||||
symbol = pgen_grammar.number2symbol[type]
|
||||
try:
|
||||
return self.node_map[symbol](children)
|
||||
return self.node_map[nonterminal](children)
|
||||
except KeyError:
|
||||
if symbol == 'suite':
|
||||
if nonterminal == 'suite':
|
||||
# We don't want the INDENT/DEDENT in our parser tree. Those
|
||||
# leaves are just cancer. They are virtual leaves and not real
|
||||
# ones and therefore have pseudo start/end positions and no
|
||||
# prefixes. Just ignore them.
|
||||
children = [children[0]] + children[2:-1]
|
||||
elif symbol == 'list_if':
|
||||
elif nonterminal == 'list_if':
|
||||
# Make transitioning from 2 to 3 easier.
|
||||
symbol = 'comp_if'
|
||||
elif symbol == 'listmaker':
|
||||
nonterminal = 'comp_if'
|
||||
elif nonterminal == 'listmaker':
|
||||
# Same as list_if above.
|
||||
symbol = 'testlist_comp'
|
||||
return self.default_node(symbol, children)
|
||||
nonterminal = 'testlist_comp'
|
||||
return self.default_node(nonterminal, children)
|
||||
|
||||
def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos):
|
||||
def convert_leaf(self, type, value, prefix, start_pos):
|
||||
# print('leaf', repr(value), token.tok_name[type])
|
||||
if type == NAME:
|
||||
if value in pgen_grammar.keywords:
|
||||
if value in self._pgen_grammar.reserved_syntax_strings:
|
||||
return tree.Keyword(value, start_pos, prefix)
|
||||
else:
|
||||
return tree.Name(value, start_pos, prefix)
|
||||
|
||||
return self._leaf_map.get(type, tree.Operator)(value, start_pos, prefix)
|
||||
|
||||
def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
|
||||
add_token_callback):
|
||||
def get_symbol_and_nodes(stack):
|
||||
for dfa, state, (type_, nodes) in stack:
|
||||
symbol = pgen_grammar.number2symbol[type_]
|
||||
yield symbol, nodes
|
||||
|
||||
tos_nodes = stack.get_tos_nodes()
|
||||
def error_recovery(self, token):
|
||||
tos_nodes = self.stack[-1].nodes
|
||||
if tos_nodes:
|
||||
last_leaf = tos_nodes[-1].get_last_leaf()
|
||||
else:
|
||||
last_leaf = None
|
||||
|
||||
if self._start_symbol == 'file_input' and \
|
||||
(typ == ENDMARKER or typ == DEDENT and '\n' not in last_leaf.value):
|
||||
def reduce_stack(states, newstate):
|
||||
# reduce
|
||||
state = newstate
|
||||
while states[state] == [(0, state)]:
|
||||
self.pgen_parser._pop()
|
||||
|
||||
dfa, state, (type_, nodes) = stack[-1]
|
||||
states, first = dfa
|
||||
|
||||
|
||||
if self._start_nonterminal == 'file_input' and \
|
||||
(token.type == PythonTokenTypes.ENDMARKER or
|
||||
token.type == DEDENT and '\n' not in last_leaf.value):
|
||||
# In Python statements need to end with a newline. But since it's
|
||||
# possible (and valid in Python ) that there's no newline at the
|
||||
# end of a file, we have to recover even if the user doesn't want
|
||||
# error recovery.
|
||||
#print('x', pprint.pprint(stack))
|
||||
ilabel = token_to_ilabel(pgen_grammar, NEWLINE, value)
|
||||
|
||||
dfa, state, (type_, nodes) = stack[-1]
|
||||
symbol = pgen_grammar.number2symbol[type_]
|
||||
states, first = dfa
|
||||
arcs = states[state]
|
||||
# Look for a state with this label
|
||||
for i, newstate in arcs:
|
||||
if ilabel == i:
|
||||
if symbol == 'simple_stmt':
|
||||
# This is basically shifting
|
||||
stack[-1] = (dfa, newstate, (type_, nodes))
|
||||
|
||||
reduce_stack(states, newstate)
|
||||
add_token_callback(typ, value, start_pos, prefix)
|
||||
if self.stack[-1].dfa.from_rule == 'simple_stmt':
|
||||
try:
|
||||
plan = self.stack[-1].dfa.transitions[PythonTokenTypes.NEWLINE]
|
||||
except KeyError:
|
||||
pass
|
||||
else:
|
||||
if plan.next_dfa.is_final and not plan.dfa_pushes:
|
||||
# We are ignoring here that the newline would be
|
||||
# required for a simple_stmt.
|
||||
self.stack[-1].dfa = plan.next_dfa
|
||||
self._add_token(token)
|
||||
return
|
||||
# Check if we're at the right point
|
||||
#for symbol, nodes in get_symbol_and_nodes(stack):
|
||||
# self.pgen_parser._pop()
|
||||
|
||||
#break
|
||||
break
|
||||
#symbol = pgen_grammar.number2symbol[type_]
|
||||
|
||||
if not self._error_recovery:
|
||||
return super(Parser, self).error_recovery(
|
||||
pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
|
||||
add_token_callback)
|
||||
return super(Parser, self).error_recovery(token)
|
||||
|
||||
def current_suite(stack):
|
||||
# For now just discard everything that is not a suite or
|
||||
# file_input, if we detect an error.
|
||||
for index, (symbol, nodes) in reversed(list(enumerate(get_symbol_and_nodes(stack)))):
|
||||
for until_index, stack_node in reversed(list(enumerate(stack))):
|
||||
# `suite` can sometimes be only simple_stmt, not stmt.
|
||||
if symbol == 'file_input':
|
||||
if stack_node.nonterminal == 'file_input':
|
||||
break
|
||||
elif symbol == 'suite' and len(nodes) > 1:
|
||||
# suites without an indent in them get discarded.
|
||||
break
|
||||
return index, symbol, nodes
|
||||
elif stack_node.nonterminal == 'suite':
|
||||
# In the case where we just have a newline we don't want to
|
||||
# do error recovery here. In all other cases, we want to do
|
||||
# error recovery.
|
||||
if len(stack_node.nodes) != 1:
|
||||
break
|
||||
return until_index
|
||||
|
||||
index, symbol, nodes = current_suite(stack)
|
||||
until_index = current_suite(self.stack)
|
||||
|
||||
# print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index)
|
||||
if self._stack_removal(pgen_grammar, stack, arcs, index + 1, value, start_pos):
|
||||
add_token_callback(typ, value, start_pos, prefix)
|
||||
if self._stack_removal(until_index + 1):
|
||||
self._add_token(token)
|
||||
else:
|
||||
typ, value, start_pos, prefix = token
|
||||
if typ == INDENT:
|
||||
# For every deleted INDENT we have to delete a DEDENT as well.
|
||||
# Otherwise the parser will get into trouble and DEDENT too early.
|
||||
self._omit_dedent_list.append(self._indent_counter)
|
||||
|
||||
error_leaf = tree.PythonErrorLeaf(tok_name[typ].lower(), value, start_pos, prefix)
|
||||
stack[-1][2][1].append(error_leaf)
|
||||
error_leaf = tree.PythonErrorLeaf(typ.name, value, start_pos, prefix)
|
||||
self.stack[-1].nodes.append(error_leaf)
|
||||
|
||||
if symbol == 'suite':
|
||||
dfa, state, node = stack[-1]
|
||||
states, first = dfa
|
||||
arcs = states[state]
|
||||
intended_label = pgen_grammar.symbol2label['stmt']
|
||||
# Introduce a proper state transition. We're basically allowing
|
||||
# there to be no valid statements inside a suite.
|
||||
if [x[0] for x in arcs] == [intended_label]:
|
||||
new_state = arcs[0][1]
|
||||
stack[-1] = dfa, new_state, node
|
||||
tos = self.stack[-1]
|
||||
if tos.nonterminal == 'suite':
|
||||
# Need at least one statement in the suite. This happend with the
|
||||
# error recovery above.
|
||||
try:
|
||||
tos.dfa = tos.dfa.arcs['stmt']
|
||||
except KeyError:
|
||||
# We're already in a final state.
|
||||
pass
|
||||
|
||||
def _stack_removal(self, pgen_grammar, stack, arcs, start_index, value, start_pos):
|
||||
failed_stack = False
|
||||
found = False
|
||||
all_nodes = []
|
||||
for dfa, state, (type_, nodes) in stack[start_index:]:
|
||||
if nodes:
|
||||
found = True
|
||||
if found:
|
||||
failed_stack = True
|
||||
all_nodes += nodes
|
||||
if failed_stack:
|
||||
stack[start_index - 1][2][1].append(tree.PythonErrorNode(all_nodes))
|
||||
def _stack_removal(self, start_index):
|
||||
all_nodes = [node for stack_node in self.stack[start_index:] for node in stack_node.nodes]
|
||||
|
||||
stack[start_index:] = []
|
||||
return failed_stack
|
||||
if all_nodes:
|
||||
self.stack[start_index - 1].nodes.append(tree.PythonErrorNode(all_nodes))
|
||||
|
||||
self.stack[start_index:] = []
|
||||
return bool(all_nodes)
|
||||
|
||||
def _recovery_tokenize(self, tokens):
|
||||
for typ, value, start_pos, prefix in tokens:
|
||||
for token in tokens:
|
||||
typ = token[0]
|
||||
# print(tok_name[typ], repr(value), start_pos, repr(prefix))
|
||||
if typ == DEDENT:
|
||||
# We need to count indents, because if we just omit any DEDENT,
|
||||
@@ -262,4 +209,4 @@ class Parser(BaseParser):
|
||||
self._indent_counter -= 1
|
||||
elif typ == INDENT:
|
||||
self._indent_counter += 1
|
||||
yield typ, value, start_pos, prefix
|
||||
yield token
|
||||
|
||||
@@ -1,113 +1,27 @@
|
||||
from __future__ import absolute_import
|
||||
from itertools import count
|
||||
from token import *
|
||||
|
||||
from parso._compatibility import py_version
|
||||
|
||||
|
||||
_counter = count(N_TOKENS)
|
||||
# Never want to see this thing again.
|
||||
del N_TOKENS
|
||||
class TokenType(object):
|
||||
def __init__(self, name, contains_syntax=False):
|
||||
self.name = name
|
||||
self.contains_syntax = contains_syntax
|
||||
|
||||
COMMENT = next(_counter)
|
||||
tok_name[COMMENT] = 'COMMENT'
|
||||
|
||||
NL = next(_counter)
|
||||
tok_name[NL] = 'NL'
|
||||
|
||||
# Sets the attributes that don't exist in these tok_name versions.
|
||||
if py_version >= 30:
|
||||
BACKQUOTE = next(_counter)
|
||||
tok_name[BACKQUOTE] = 'BACKQUOTE'
|
||||
else:
|
||||
RARROW = next(_counter)
|
||||
tok_name[RARROW] = 'RARROW'
|
||||
ELLIPSIS = next(_counter)
|
||||
tok_name[ELLIPSIS] = 'ELLIPSIS'
|
||||
|
||||
if py_version < 35:
|
||||
ATEQUAL = next(_counter)
|
||||
tok_name[ATEQUAL] = 'ATEQUAL'
|
||||
|
||||
ERROR_DEDENT = next(_counter)
|
||||
tok_name[ERROR_DEDENT] = 'ERROR_DEDENT'
|
||||
|
||||
FSTRING_START = next(_counter)
|
||||
tok_name[FSTRING_START] = 'FSTRING_START'
|
||||
FSTRING_END = next(_counter)
|
||||
tok_name[FSTRING_END] = 'FSTRING_END'
|
||||
FSTRING_STRING = next(_counter)
|
||||
tok_name[FSTRING_STRING] = 'FSTRING_STRING'
|
||||
EXCLAMATION = next(_counter)
|
||||
tok_name[EXCLAMATION] = 'EXCLAMATION'
|
||||
|
||||
# Map from operator to number (since tokenize doesn't do this)
|
||||
|
||||
opmap_raw = """\
|
||||
( LPAR
|
||||
) RPAR
|
||||
[ LSQB
|
||||
] RSQB
|
||||
: COLON
|
||||
, COMMA
|
||||
; SEMI
|
||||
+ PLUS
|
||||
- MINUS
|
||||
* STAR
|
||||
/ SLASH
|
||||
| VBAR
|
||||
& AMPER
|
||||
< LESS
|
||||
> GREATER
|
||||
= EQUAL
|
||||
. DOT
|
||||
% PERCENT
|
||||
` BACKQUOTE
|
||||
{ LBRACE
|
||||
} RBRACE
|
||||
@ AT
|
||||
== EQEQUAL
|
||||
!= NOTEQUAL
|
||||
<> NOTEQUAL
|
||||
<= LESSEQUAL
|
||||
>= GREATEREQUAL
|
||||
~ TILDE
|
||||
^ CIRCUMFLEX
|
||||
<< LEFTSHIFT
|
||||
>> RIGHTSHIFT
|
||||
** DOUBLESTAR
|
||||
+= PLUSEQUAL
|
||||
-= MINEQUAL
|
||||
*= STAREQUAL
|
||||
/= SLASHEQUAL
|
||||
%= PERCENTEQUAL
|
||||
&= AMPEREQUAL
|
||||
|= VBAREQUAL
|
||||
@= ATEQUAL
|
||||
^= CIRCUMFLEXEQUAL
|
||||
<<= LEFTSHIFTEQUAL
|
||||
>>= RIGHTSHIFTEQUAL
|
||||
**= DOUBLESTAREQUAL
|
||||
// DOUBLESLASH
|
||||
//= DOUBLESLASHEQUAL
|
||||
-> RARROW
|
||||
... ELLIPSIS
|
||||
! EXCLAMATION
|
||||
"""
|
||||
|
||||
opmap = {}
|
||||
for line in opmap_raw.splitlines():
|
||||
op, name = line.split()
|
||||
opmap[op] = globals()[name]
|
||||
def __repr__(self):
|
||||
return '%s(%s)' % (self.__class__.__name__, self.name)
|
||||
|
||||
|
||||
def generate_token_id(string):
|
||||
class TokenTypes(object):
|
||||
"""
|
||||
Uses a token in the grammar (e.g. `'+'` or `'and'`returns the corresponding
|
||||
ID for it. The strings are part of the grammar file.
|
||||
Basically an enum, but Python 2 doesn't have enums in the standard library.
|
||||
"""
|
||||
try:
|
||||
return opmap[string]
|
||||
except KeyError:
|
||||
pass
|
||||
return globals()[string]
|
||||
def __init__(self, names, contains_syntax):
|
||||
for name in names:
|
||||
setattr(self, name, TokenType(name, contains_syntax=name in contains_syntax))
|
||||
|
||||
|
||||
PythonTokenTypes = TokenTypes((
|
||||
'STRING', 'NUMBER', 'NAME', 'ERRORTOKEN', 'NEWLINE', 'INDENT', 'DEDENT',
|
||||
'ERROR_DEDENT', 'FSTRING_STRING', 'FSTRING_START', 'FSTRING_END', 'OP',
|
||||
'ENDMARKER'),
|
||||
contains_syntax=('NAME', 'OP'),
|
||||
)
|
||||
|
||||
@@ -18,17 +18,29 @@ from collections import namedtuple
|
||||
import itertools as _itertools
|
||||
from codecs import BOM_UTF8
|
||||
|
||||
from parso.python.token import (tok_name, ENDMARKER, STRING, NUMBER, opmap,
|
||||
NAME, ERRORTOKEN, NEWLINE, INDENT, DEDENT,
|
||||
ERROR_DEDENT, FSTRING_STRING, FSTRING_START,
|
||||
FSTRING_END)
|
||||
from parso.python.token import PythonTokenTypes
|
||||
from parso._compatibility import py_version
|
||||
from parso.utils import split_lines
|
||||
|
||||
|
||||
STRING = PythonTokenTypes.STRING
|
||||
NAME = PythonTokenTypes.NAME
|
||||
NUMBER = PythonTokenTypes.NUMBER
|
||||
OP = PythonTokenTypes.OP
|
||||
NEWLINE = PythonTokenTypes.NEWLINE
|
||||
INDENT = PythonTokenTypes.INDENT
|
||||
DEDENT = PythonTokenTypes.DEDENT
|
||||
ENDMARKER = PythonTokenTypes.ENDMARKER
|
||||
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
|
||||
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
|
||||
FSTRING_START = PythonTokenTypes.FSTRING_START
|
||||
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
|
||||
FSTRING_END = PythonTokenTypes.FSTRING_END
|
||||
|
||||
TokenCollection = namedtuple(
|
||||
'TokenCollection',
|
||||
'pseudo_token single_quoted triple_quoted endpats fstring_pattern_map always_break_tokens',
|
||||
'pseudo_token single_quoted triple_quoted endpats whitespace '
|
||||
'fstring_pattern_map always_break_tokens',
|
||||
)
|
||||
|
||||
BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')
|
||||
@@ -114,6 +126,7 @@ def _create_token_collection(version_info):
|
||||
# Note: we use unicode matching for names ("\w") but ascii matching for
|
||||
# number literals.
|
||||
Whitespace = r'[ \f\t]*'
|
||||
whitespace = _compile(Whitespace)
|
||||
Comment = r'#[^\r\n]*'
|
||||
Name = r'\w+'
|
||||
|
||||
@@ -225,7 +238,7 @@ def _create_token_collection(version_info):
|
||||
pseudo_token_compiled = _compile(PseudoToken)
|
||||
return TokenCollection(
|
||||
pseudo_token_compiled, single_quoted, triple_quoted, endpats,
|
||||
fstring_pattern_map, ALWAYS_BREAK_TOKENS
|
||||
whitespace, fstring_pattern_map, ALWAYS_BREAK_TOKENS
|
||||
)
|
||||
|
||||
|
||||
@@ -240,12 +253,9 @@ class Token(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
|
||||
|
||||
|
||||
class PythonToken(Token):
|
||||
def _get_type_name(self, exact=True):
|
||||
return tok_name[self.type]
|
||||
|
||||
def __repr__(self):
|
||||
return ('TokenInfo(type=%s, string=%r, start=%r, prefix=%r)' %
|
||||
self._replace(type=self._get_type_name()))
|
||||
return ('TokenInfo(type=%s, string=%r, start_pos=%r, prefix=%r)' %
|
||||
self._replace(type=self.type.name))
|
||||
|
||||
|
||||
class FStringNode(object):
|
||||
@@ -354,7 +364,8 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
|
||||
token. This idea comes from lib2to3. The prefix contains all information
|
||||
that is irrelevant for the parser like newlines in parentheses or comments.
|
||||
"""
|
||||
pseudo_token, single_quoted, triple_quoted, endpats, fstring_pattern_map, always_break_tokens, = \
|
||||
pseudo_token, single_quoted, triple_quoted, endpats, whitespace, \
|
||||
fstring_pattern_map, always_break_tokens, = \
|
||||
_get_token_collection(version_info)
|
||||
paren_level = 0 # count parentheses
|
||||
indents = [0]
|
||||
@@ -393,7 +404,9 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
|
||||
endmatch = endprog.match(line)
|
||||
if endmatch:
|
||||
pos = endmatch.end(0)
|
||||
yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix)
|
||||
yield PythonToken(
|
||||
STRING, contstr + line[:pos],
|
||||
contstr_start, prefix)
|
||||
contstr = ''
|
||||
contline = None
|
||||
else:
|
||||
@@ -435,10 +448,14 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
|
||||
|
||||
pseudomatch = pseudo_token.match(line, pos)
|
||||
if not pseudomatch: # scan for tokens
|
||||
txt = line[pos:]
|
||||
if txt.endswith('\n'):
|
||||
if line.endswith('\n'):
|
||||
new_line = True
|
||||
yield PythonToken(ERRORTOKEN, txt, (lnum, pos), additional_prefix)
|
||||
match = whitespace.match(line, pos)
|
||||
pos = match.end()
|
||||
yield PythonToken(
|
||||
ERRORTOKEN, line[pos:], (lnum, pos),
|
||||
additional_prefix + match.group(0)
|
||||
)
|
||||
additional_prefix = ''
|
||||
break
|
||||
|
||||
@@ -564,13 +581,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
|
||||
and fstring_stack[-1].parentheses_count == 1:
|
||||
fstring_stack[-1].format_spec_count += 1
|
||||
|
||||
try:
|
||||
# This check is needed in any case to check if it's a valid
|
||||
# operator or just some random unicode character.
|
||||
typ = opmap[token]
|
||||
except KeyError:
|
||||
typ = ERRORTOKEN
|
||||
yield PythonToken(typ, token, spos, prefix)
|
||||
yield PythonToken(OP, token, spos, prefix)
|
||||
|
||||
if contstr:
|
||||
yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
|
||||
|
||||
@@ -60,7 +60,6 @@ _GET_DEFINITION_TYPES = set([
|
||||
_IMPORTS = set(['import_name', 'import_from'])
|
||||
|
||||
|
||||
|
||||
class DocstringMixin(object):
|
||||
__slots__ = ()
|
||||
|
||||
@@ -125,7 +124,7 @@ class PythonLeaf(PythonMixin, Leaf):
|
||||
# indent error leafs somehow? No idea how, though.
|
||||
previous_leaf = self.get_previous_leaf()
|
||||
if previous_leaf is not None and previous_leaf.type == 'error_leaf' \
|
||||
and previous_leaf.original_type in ('indent', 'error_dedent'):
|
||||
and previous_leaf.token_type in ('INDENT', 'ERROR_DEDENT'):
|
||||
previous_leaf = previous_leaf.get_previous_leaf()
|
||||
|
||||
if previous_leaf is None:
|
||||
@@ -133,7 +132,6 @@ class PythonLeaf(PythonMixin, Leaf):
|
||||
return previous_leaf.end_pos
|
||||
|
||||
|
||||
|
||||
class _LeafWithoutNewlines(PythonLeaf):
|
||||
"""
|
||||
Simply here to optimize performance.
|
||||
@@ -166,6 +164,10 @@ class EndMarker(_LeafWithoutNewlines):
|
||||
__slots__ = ()
|
||||
type = 'endmarker'
|
||||
|
||||
@utf8_repr
|
||||
def __repr__(self):
|
||||
return "<%s: prefix=%s>" % (type(self).__name__, repr(self.prefix))
|
||||
|
||||
|
||||
class Newline(PythonLeaf):
|
||||
"""Contains NEWLINE and ENDMARKER tokens."""
|
||||
@@ -235,7 +237,6 @@ class Name(_LeafWithoutNewlines):
|
||||
return None
|
||||
|
||||
|
||||
|
||||
class Literal(PythonLeaf):
|
||||
__slots__ = ()
|
||||
|
||||
@@ -653,6 +654,7 @@ class Function(ClassOrFunc):
|
||||
except IndexError:
|
||||
return None
|
||||
|
||||
|
||||
class Lambda(Function):
|
||||
"""
|
||||
Lambdas are basically trimmed functions, so give it the same interface.
|
||||
|
||||
@@ -229,6 +229,7 @@ class Leaf(NodeOrLeaf):
|
||||
|
||||
class TypedLeaf(Leaf):
|
||||
__slots__ = ('type',)
|
||||
|
||||
def __init__(self, type, value, start_pos, prefix=''):
|
||||
super(TypedLeaf, self).__init__(value, start_pos, prefix)
|
||||
self.type = type
|
||||
@@ -351,13 +352,13 @@ class ErrorLeaf(Leaf):
|
||||
A leaf that is either completely invalid in a language (like `$` in Python)
|
||||
or is invalid at that position. Like the star in `1 +* 1`.
|
||||
"""
|
||||
__slots__ = ('original_type',)
|
||||
__slots__ = ('token_type',)
|
||||
type = 'error_leaf'
|
||||
|
||||
def __init__(self, original_type, value, start_pos, prefix=''):
|
||||
def __init__(self, token_type, value, start_pos, prefix=''):
|
||||
super(ErrorLeaf, self).__init__(value, start_pos, prefix)
|
||||
self.original_type = original_type
|
||||
self.token_type = token_type
|
||||
|
||||
def __repr__(self):
|
||||
return "<%s: %s:%s, %s>" % \
|
||||
(type(self).__name__, self.original_type, repr(self.value), self.start_pos)
|
||||
(type(self).__name__, self.token_type, repr(self.value), self.start_pos)
|
||||
|
||||
@@ -502,3 +502,8 @@ def test_endmarker_newline(differ):
|
||||
|
||||
differ.initialize(code1)
|
||||
differ.parse(code2, parsers=2, copies=2, expect_error_leaves=True)
|
||||
|
||||
|
||||
def test_newlines_at_end(differ):
|
||||
differ.initialize('a\n\n')
|
||||
differ.parse('a\n', copies=1)
|
||||
|
||||
61
test/test_error_recovery.py
Normal file
61
test/test_error_recovery.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from parso import parse
|
||||
|
||||
|
||||
def test_with_stmt():
|
||||
module = parse('with x: f.\na')
|
||||
assert module.children[0].type == 'with_stmt'
|
||||
w, with_item, colon, f = module.children[0].children
|
||||
assert f.type == 'error_node'
|
||||
assert f.get_code(include_prefix=False) == 'f.'
|
||||
|
||||
assert module.children[2].type == 'name'
|
||||
|
||||
|
||||
def test_one_line_function(each_version):
|
||||
module = parse('def x(): f.', version=each_version)
|
||||
assert module.children[0].type == 'funcdef'
|
||||
def_, name, parameters, colon, f = module.children[0].children
|
||||
assert f.type == 'error_node'
|
||||
|
||||
module = parse('def x(a:', version=each_version)
|
||||
func = module.children[0]
|
||||
assert func.type == 'error_node'
|
||||
if each_version.startswith('2'):
|
||||
assert func.children[-1].value == 'a'
|
||||
else:
|
||||
assert func.children[-1] == ':'
|
||||
|
||||
|
||||
def test_if_else():
|
||||
module = parse('if x:\n f.\nelse:\n g(')
|
||||
if_stmt = module.children[0]
|
||||
if_, test, colon, suite1, else_, colon, suite2 = if_stmt.children
|
||||
f = suite1.children[1]
|
||||
assert f.type == 'error_node'
|
||||
assert f.children[0].value == 'f'
|
||||
assert f.children[1].value == '.'
|
||||
g = suite2.children[1]
|
||||
assert g.children[0].value == 'g'
|
||||
assert g.children[1].value == '('
|
||||
|
||||
|
||||
def test_if_stmt():
|
||||
module = parse('if x: f.\nelse: g(')
|
||||
if_stmt = module.children[0]
|
||||
assert if_stmt.type == 'if_stmt'
|
||||
if_, test, colon, f = if_stmt.children
|
||||
assert f.type == 'error_node'
|
||||
assert f.children[0].value == 'f'
|
||||
assert f.children[1].value == '.'
|
||||
|
||||
assert module.children[1].type == 'newline'
|
||||
assert module.children[1].value == '\n'
|
||||
assert module.children[2].type == 'error_leaf'
|
||||
assert module.children[2].value == 'else'
|
||||
assert module.children[3].type == 'error_leaf'
|
||||
assert module.children[3].value == ':'
|
||||
|
||||
in_else_stmt = module.children[4]
|
||||
assert in_else_stmt.type == 'error_node'
|
||||
assert in_else_stmt.children[0].value == 'g'
|
||||
assert in_else_stmt.children[1].value == '('
|
||||
@@ -12,6 +12,8 @@ import pytest
|
||||
|
||||
from parso import load_grammar
|
||||
from parso import ParserSyntaxError
|
||||
from parso.pgen2 import generate_grammar
|
||||
from parso.python import tokenize
|
||||
|
||||
|
||||
def _parse(code, version=None):
|
||||
@@ -270,3 +272,19 @@ def py_br(each_version):
|
||||
def test_py3_rb(works_ge_py3):
|
||||
works_ge_py3.parse("rb'1'")
|
||||
works_ge_py3.parse("RB'1'")
|
||||
|
||||
|
||||
def test_left_recursion():
|
||||
with pytest.raises(ValueError, match='left recursion'):
|
||||
generate_grammar('foo: foo NAME\n', tokenize.PythonTokenTypes)
|
||||
|
||||
|
||||
def test_ambiguities():
|
||||
with pytest.raises(ValueError, match='ambiguous'):
|
||||
generate_grammar('foo: bar | baz\nbar: NAME\nbaz: NAME\n', tokenize.PythonTokenTypes)
|
||||
|
||||
with pytest.raises(ValueError, match='ambiguous'):
|
||||
generate_grammar('''foo: bar | baz\nbar: 'x'\nbaz: "x"\n''', tokenize.PythonTokenTypes)
|
||||
|
||||
with pytest.raises(ValueError, match='ambiguous'):
|
||||
generate_grammar('''foo: bar | 'x'\nbar: 'x'\n''', tokenize.PythonTokenTypes)
|
||||
|
||||
@@ -6,14 +6,25 @@ import pytest
|
||||
|
||||
from parso._compatibility import py_version
|
||||
from parso.utils import split_lines, parse_version_string
|
||||
from parso.python.token import (
|
||||
NAME, NEWLINE, STRING, INDENT, DEDENT, ERRORTOKEN, ENDMARKER, ERROR_DEDENT,
|
||||
FSTRING_START)
|
||||
from parso.python.token import PythonTokenTypes
|
||||
from parso.python import tokenize
|
||||
from parso import parse
|
||||
from parso.python.tokenize import PythonToken
|
||||
|
||||
|
||||
# To make it easier to access some of the token types, just put them here.
|
||||
NAME = PythonTokenTypes.NAME
|
||||
NEWLINE = PythonTokenTypes.NEWLINE
|
||||
STRING = PythonTokenTypes.STRING
|
||||
INDENT = PythonTokenTypes.INDENT
|
||||
DEDENT = PythonTokenTypes.DEDENT
|
||||
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
|
||||
OP = PythonTokenTypes.OP
|
||||
ENDMARKER = PythonTokenTypes.ENDMARKER
|
||||
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
|
||||
FSTRING_START = PythonTokenTypes.FSTRING_START
|
||||
|
||||
|
||||
def _get_token_list(string):
|
||||
# Load the current version.
|
||||
version_info = parse_version_string()
|
||||
@@ -127,7 +138,7 @@ def test_identifier_contains_unicode():
|
||||
else:
|
||||
# Unicode tokens in Python 2 seem to be identified as operators.
|
||||
# They will be ignored in the parser, that's ok.
|
||||
assert unicode_token[0] == tokenize.ERRORTOKEN
|
||||
assert unicode_token[0] == OP
|
||||
|
||||
|
||||
def test_quoted_strings():
|
||||
@@ -187,17 +198,17 @@ def test_ur_literals():
|
||||
|
||||
def test_error_literal():
|
||||
error_token, endmarker = _get_token_list('"\n')
|
||||
assert error_token.type == tokenize.ERRORTOKEN
|
||||
assert error_token.type == ERRORTOKEN
|
||||
assert endmarker.prefix == ''
|
||||
assert error_token.string == '"\n'
|
||||
assert endmarker.type == tokenize.ENDMARKER
|
||||
assert endmarker.type == ENDMARKER
|
||||
assert endmarker.prefix == ''
|
||||
|
||||
bracket, error_token, endmarker = _get_token_list('( """')
|
||||
assert error_token.type == tokenize.ERRORTOKEN
|
||||
assert error_token.type == ERRORTOKEN
|
||||
assert error_token.prefix == ' '
|
||||
assert error_token.string == '"""'
|
||||
assert endmarker.type == tokenize.ENDMARKER
|
||||
assert endmarker.type == ENDMARKER
|
||||
assert endmarker.prefix == ''
|
||||
|
||||
|
||||
@@ -227,3 +238,11 @@ def test_endmarker_end_pos():
|
||||
def test_indentation(code, types):
|
||||
actual_types = [t.type for t in _get_token_list(code)]
|
||||
assert actual_types == types + [ENDMARKER]
|
||||
|
||||
|
||||
def test_error_string():
|
||||
t1, endmarker = _get_token_list(' "\n')
|
||||
assert t1.type == ERRORTOKEN
|
||||
assert t1.prefix == ' '
|
||||
assert t1.string == '"\n'
|
||||
assert endmarker.string == ''
|
||||
|
||||
Reference in New Issue
Block a user