From d9d3740c92ec66feccc8b7acbdd56d658d6c3e57 Mon Sep 17 00:00:00 2001
From: Dave Halter
Date: Tue, 16 Dec 2014 01:52:15 +0100
Subject: [PATCH] Replace the old pgen2 token module with a token module more
 tightly coupled to the standard library.

---
 jedi/parser/__init__.py      | 18 ++++----
 jedi/parser/pgen2/grammar.py | 62 ----------------------------
 jedi/parser/pgen2/pgen.py    | 39 ++++++++++--------
 jedi/parser/token.py         | 79 ++++++++++++++++++++++++++++++++++++
 jedi/parser/tokenize.py      |  3 ++
 5 files changed, 112 insertions(+), 89 deletions(-)
 create mode 100644 jedi/parser/token.py

diff --git a/jedi/parser/__init__.py b/jedi/parser/__init__.py
index 0079cef3..6c8e7e6b 100644
--- a/jedi/parser/__init__.py
+++ b/jedi/parser/__init__.py
@@ -19,7 +19,7 @@ import os
 
 from jedi.parser import tree as pt
 from jedi.parser import tokenize
-from jedi.parser.pgen2 import grammar
+from jedi.parser import token
 from jedi.parser.pgen2.pgen import generate_grammar
 from jedi.parser.pgen2.parse import PgenParser
@@ -184,11 +184,11 @@ class Parser(object):
                 arr = self.scope_names_stack[-1].setdefault(name.value, [])
                 arr.append(name)
             return name
-        elif type == tokenize.STRING:
+        elif type == token.STRING:
             return pt.String(value, start_pos, prefix)
-        elif type == tokenize.NUMBER:
+        elif type == token.NUMBER:
             return pt.Number(value, start_pos, prefix)
-        elif type in (tokenize.NEWLINE, tokenize.ENDMARKER):
+        elif type in (token.NEWLINE, token.ENDMARKER):
             return pt.Whitespace(value, start_pos, prefix)
         else:
             return pt.Operator(value, start_pos, prefix)
@@ -228,12 +228,12 @@ class Parser(object):
                     nodes = suite_nodes
             stack[index]
 
-            #print('err', tokenize.tok_name[typ], repr(value), start_pos, len(stack), index)
+            #print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index)
             self._stack_removal(grammar, stack, index + 1, value, start_pos)
             if value in ('import', 'from', 'class', 'def', 'try', 'while', 'return'):
                 # Those can always be new statements.
                 add_token_callback(typ, value, prefix, start_pos)
-            elif typ == tokenize.DEDENT:
+            elif typ == token.DEDENT:
                 if symbol == 'suite':
                     # If a function or anything else contains a suite that is
                     # "empty" (just NEWLINE/INDENT), we remove it. If it's not
@@ -282,7 +282,7 @@ class Parser(object):
     def _tokenize(self, tokenizer):
         """
         while first_pos[1] <= self._scope.start_pos[1] \
-                and (token_type == tokenize.NAME or tok_str in ('(', '['))\
+                and (token_type == token.NAME or tok_str in ('(', '['))\
                 and self._scope != self.module:
             self._scope.end_pos = first_pos
             self._scope = self._scope.parent
@@ -292,8 +292,8 @@ class Parser(object):
         """
         for typ, value, start_pos, prefix in tokenizer:
-            if typ == tokenize.OP:
-                typ = grammar.opmap[value]
+            if typ == token.OP:
+                typ = token.opmap[value]
             yield typ, value, prefix, start_pos
 
     def __repr__(self):
diff --git a/jedi/parser/pgen2/grammar.py b/jedi/parser/pgen2/grammar.py
index 01db4798..414c0dbe 100644
--- a/jedi/parser/pgen2/grammar.py
+++ b/jedi/parser/pgen2/grammar.py
@@ -19,9 +19,6 @@ fallback token code OP, but the parser needs the actual token code.
 # Python imports
 import pickle
 
-# Local imports
-from . import token
-
 
 class Grammar(object):
     """Pgen parsing tables conversion class.
@@ -126,62 +123,3 @@ class Grammar(object):
         print("labels")
         pprint(self.labels)
         print("start", self.start)
-
-
-# Map from operator to number (since tokenize doesn't do this)
-
-opmap_raw = """
-( LPAR
-) RPAR
-[ LSQB
-] RSQB
-: COLON
-, COMMA
-; SEMI
-+ PLUS
-- MINUS
-* STAR
-/ SLASH
-| VBAR
-& AMPER
-< LESS
-> GREATER
-= EQUAL
-. DOT
-% PERCENT
-` BACKQUOTE
-{ LBRACE
-} RBRACE
-@ AT
-== EQEQUAL
-!= NOTEQUAL
-<> NOTEQUAL
-<= LESSEQUAL
->= GREATEREQUAL
-~ TILDE
-^ CIRCUMFLEX
-<< LEFTSHIFT
->> RIGHTSHIFT
-** DOUBLESTAR
-+= PLUSEQUAL
--= MINEQUAL
-*= STAREQUAL
-/= SLASHEQUAL
-%= PERCENTEQUAL
-&= AMPEREQUAL
-|= VBAREQUAL
-^= CIRCUMFLEXEQUAL
-<<= LEFTSHIFTEQUAL
->>= RIGHTSHIFTEQUAL
-**= DOUBLESTAREQUAL
-// DOUBLESLASH
-//= DOUBLESLASHEQUAL
--> RARROW
-... ELLIPSIS
-"""
-
-opmap = {}
-for line in opmap_raw.splitlines():
-    if line:
-        op, name = line.split()
-        opmap[op] = getattr(token, name)
diff --git a/jedi/parser/pgen2/pgen.py b/jedi/parser/pgen2/pgen.py
index 1ab5f699..fa2742dd 100644
--- a/jedi/parser/pgen2/pgen.py
+++ b/jedi/parser/pgen2/pgen.py
@@ -6,7 +6,9 @@
 # Modifications are dual-licensed: MIT and PSF.
 
 # Pgen imports
-from . import grammar, tokenize
+from . import grammar
+from jedi.parser import token
+from jedi.parser import tokenize
 
 
 class ParserGenerator(object):
@@ -74,9 +76,9 @@ class ParserGenerator(object):
                 return ilabel
             else:
                 # A named token (NAME, NUMBER, STRING)
-                itoken = getattr(tokenize, label, None)
+                itoken = getattr(token, label, None)
                 assert isinstance(itoken, int), label
-                assert itoken in tokenize.tok_name, label
+                assert itoken in token.tok_name, label
                 if itoken in c.tokens:
                     return c.tokens[itoken]
                 else:
@@ -92,12 +94,12 @@ class ParserGenerator(object):
                 if value in c.keywords:
                     return c.keywords[value]
                 else:
-                    c.labels.append((tokenize.NAME, value))
+                    c.labels.append((token.NAME, value))
                     c.keywords[value] = ilabel
                     return ilabel
             else:
                 # An operator (any non-numeric token)
-                itoken = grammar.opmap[value]  # Fails if unknown token
+                itoken = token.opmap[value]  # Fails if unknown token
                 if itoken in c.tokens:
                     return c.tokens[itoken]
                 else:
@@ -147,14 +149,14 @@ class ParserGenerator(object):
         dfas = {}
         startsymbol = None
         # MSTART: (NEWLINE | RULE)* ENDMARKER
-        while self.type != tokenize.ENDMARKER:
-            while self.type == tokenize.NEWLINE:
+        while self.type != token.ENDMARKER:
+            while self.type == token.NEWLINE:
                 self.gettoken()
             # RULE: NAME ':' RHS NEWLINE
-            name = self.expect(tokenize.NAME)
-            self.expect(tokenize.OP, ":")
+            name = self.expect(token.NAME)
+            self.expect(token.OP, ":")
             a, z = self.parse_rhs()
-            self.expect(tokenize.NEWLINE)
+            self.expect(token.NEWLINE)
             #self.dump_nfa(name, a, z)
             dfa = self.make_dfa(a, z)
             #self.dump_dfa(name, dfa)
@@ -271,7 +273,7 @@ class ParserGenerator(object):
         # ALT: ITEM+
         a, b = self.parse_item()
         while (self.value in ("(", "[") or
-               self.type in (tokenize.NAME, tokenize.STRING)):
+               self.type in (token.NAME, token.STRING)):
             c, d = self.parse_item()
             b.addarc(c)
             b = d
@@ -282,7 +284,7 @@ class ParserGenerator(object):
         if self.value == "[":
             self.gettoken()
             a, z = self.parse_rhs()
-            self.expect(tokenize.OP, "]")
+            self.expect(token.OP, "]")
             a.addarc(z)
             return a, z
         else:
@@ -302,9 +304,9 @@ class ParserGenerator(object):
         if self.value == "(":
             self.gettoken()
             a, z = self.parse_rhs()
-            self.expect(tokenize.OP, ")")
+            self.expect(token.OP, ")")
             return a, z
-        elif self.type in (tokenize.NAME, tokenize.STRING):
+        elif self.type in (token.NAME, token.STRING):
             a = NFAState()
             z = NFAState()
             a.addarc(z, self.value)
@@ -324,9 +326,9 @@ class ParserGenerator(object):
 
     def gettoken(self):
         tup = next(self.generator)
-        while tup[0] in (tokenize.COMMENT, tokenize.NL):
+        while tup[0] in (token.COMMENT, token.NL):
             tup = next(self.generator)
-        self.type, self.value, self.begin, self.end, self.line = tup
+        self.type, self.value, self.begin, prefix = tup
         #print tokenize.tok_name[self.type], repr(self.value)
 
     def raise_error(self, msg, *args):
@@ -335,8 +337,9 @@ class ParserGenerator(object):
             msg = msg % args
         except:
             msg = " ".join([msg] + list(map(str, args)))
-        raise SyntaxError(msg, (self.filename, self.end[0],
-                                self.end[1], self.line))
+        line = open(self.filename).readlines()[self.begin[0] - 1]
+        raise SyntaxError(msg, (self.filename, self.begin[0],
+                                self.begin[1], line))
 
 
 class NFAState(object):
diff --git a/jedi/parser/token.py b/jedi/parser/token.py
new file mode 100644
index 00000000..49a5ada9
--- /dev/null
+++ b/jedi/parser/token.py
@@ -0,0 +1,79 @@
+from __future__ import absolute_import
+
+from jedi._compatibility import is_py3
+from token import *
+
+
+COMMENT = N_TOKENS
+tok_name[COMMENT] = 'COMMENT'
+N_TOKENS += 1
+
+NL = N_TOKENS
+tok_name[NL] = 'NL'
+N_TOKENS += 1
+
+if is_py3:
+    BACKQUOTE = N_TOKENS
+    tok_name[BACKQUOTE] = 'BACKQUOTE'
+    N_TOKENS += 1
+
+
+# Map from operator to number (since tokenize doesn't do this)
+
+opmap_raw = """
+( LPAR
+) RPAR
+[ LSQB
+] RSQB
+: COLON
+, COMMA
+; SEMI
++ PLUS
+- MINUS
+* STAR
+/ SLASH
+| VBAR
+& AMPER
+< LESS
+> GREATER
+= EQUAL
+. DOT
+% PERCENT
+` BACKQUOTE
+{ LBRACE
+} RBRACE
+@ AT
+== EQEQUAL
+!= NOTEQUAL
+<> NOTEQUAL
+<= LESSEQUAL
+>= GREATEREQUAL
+~ TILDE
+^ CIRCUMFLEX
+<< LEFTSHIFT
+>> RIGHTSHIFT
+** DOUBLESTAR
++= PLUSEQUAL
+-= MINEQUAL
+*= STAREQUAL
+/= SLASHEQUAL
+%= PERCENTEQUAL
+&= AMPEREQUAL
+|= VBAREQUAL
+^= CIRCUMFLEXEQUAL
+<<= LEFTSHIFTEQUAL
+>>= RIGHTSHIFTEQUAL
+**= DOUBLESTAREQUAL
+// DOUBLESLASH
+//= DOUBLESLASHEQUAL
+-> RARROW
+... ELLIPSIS
+"""
+
+opmap = {}
+for line in opmap_raw.splitlines():
+    if line:
+        op, name = line.split()
+        if is_py3 and name == 'BACKQUOTE':
+            continue
+        opmap[op] = globals()[name]
diff --git a/jedi/parser/tokenize.py b/jedi/parser/tokenize.py
index d5e48df7..6f2c5c47 100644
--- a/jedi/parser/tokenize.py
+++ b/jedi/parser/tokenize.py
@@ -16,6 +16,9 @@ import re
 from io import StringIO
 from token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, NAME, OP,
                    ERRORTOKEN, NEWLINE, INDENT, DEDENT)
+import token
+
+from jedi._compatibility import is_py3
 
 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
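-- 
Note for reviewers (not part of the patch): the new jedi/parser/token.py is a
thin layer over the standard library's token module, so its intended behaviour
can be sanity-checked roughly as below. This is a sketch, assuming a jedi
checkout with this patch applied is on sys.path; every name used here is
defined either by the stdlib token module or by the new file above.

    from jedi.parser import token

    # Named token codes come straight from the stdlib token module.
    assert token.tok_name[token.NAME] == 'NAME'

    # COMMENT and NL (plus BACKQUOTE on Python 3) are grafted onto the
    # stdlib numbering, starting at the old N_TOKENS value.
    assert token.tok_name[token.COMMENT] == 'COMMENT'
    assert token.tok_name[token.NL] == 'NL'

    # opmap maps an operator string to its specific token code; it replaces
    # the deleted grammar.opmap in both Parser._tokenize and pgen.
    assert token.opmap['('] == token.LPAR
    assert token.opmap['**='] == token.DOUBLESTAREQUAL

This is the same narrowing Parser._tokenize performs when it turns the
generic OP type into a concrete operator token before feeding the pgen parser.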