Add the utf-8 bom to the prefix in the tokenizer.

This commit is contained in:
Dave Halter
2017-07-10 23:26:15 +02:00
parent b3923e65e8
commit 859c48170e
4 changed files with 37 additions and 2 deletions

View File

@@ -1,4 +1,5 @@
 import re
+from codecs import BOM_UTF8
 
 from parso.python.tokenize import group
@@ -34,15 +35,18 @@ class PrefixPart(object):
 )
 
+unicode_bom = BOM_UTF8.decode('utf-8')
 _comment = r'#[^\n\r\f]*'
 _backslash = r'\\\r?\n'
 _newline = r'\r?\n'
 _form_feed = r'\f'
 _only_spacing = '$'
 _spacing = r'[ \t]*'
+_bom = unicode_bom
 
 _regex = group(
-    _comment, _backslash, _newline, _form_feed, _only_spacing,
+    _comment, _backslash, _newline, _form_feed, _only_spacing, _bom,
     capture=True
 )
 _regex = re.compile(group(_spacing, capture=True) + _regex)
@@ -54,6 +58,7 @@ _types = {
     '\f': 'formfeed',
     '\n': 'newline',
     '\r': 'newline',
+    unicode_bom: 'bom'
 }

View File

@@ -15,6 +15,7 @@ import string
 import re
 from collections import namedtuple
 import itertools as _itertools
+from codecs import BOM_UTF8
 
 from parso.python.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap,
                                 NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
@@ -22,6 +23,8 @@ from parso._compatibility import py_version, u
 from parso.utils import splitlines
 
+BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')
+
 if py_version >= 30:
     # Python 3 has str.isidentifier() to check if a char is a valid identifier
     is_identifier = str.isidentifier
@@ -227,9 +230,15 @@ def tokenize_lines(lines):
     new_line = True
     prefix = ''  # Should never be required, but here for safety
     additional_prefix = ''
+    first = True
     for lnum, line in enumerate(lines, 1):  # loop over lines in stream
-        pos, max = 0, len(line)
+        if first:
+            if line.startswith(BOM_UTF8_STRING):
+                additional_prefix = BOM_UTF8_STRING
+                line = line[1:]
+            first = False
+        pos, max = 0, len(line)
         if contstr:  # continued string
             endmatch = endprog.match(line)
             if endmatch:

View File

@@ -1,9 +1,12 @@
 from itertools import zip_longest
+from codecs import BOM_UTF8
 
 import pytest
 
 import parso
 
+unicode_bom = BOM_UTF8.decode('utf-8')
+
 
 @pytest.mark.parametrize(('string', 'tokens'), [
     ('', ['']),
@@ -48,6 +51,7 @@ def test_simple_prefix_splitting(string, tokens):
     ('\\\n', ['backslash', 'spacing']),
     (' \t', ['spacing']),
     (' \t ', ['spacing']),
+    (unicode_bom + ' # ', ['bom', 'comment', 'spacing']),
 ])
 def test_prefix_splitting_types(string, types):
     tree = parso.parse(string)

View File

@@ -1,4 +1,7 @@
+from codecs import BOM_UTF8
+
 from parso.utils import splitlines, source_to_unicode
+import parso
def test_splitlines_no_keepends(): def test_splitlines_no_keepends():
@@ -25,3 +28,17 @@ def test_source_to_unicode_unicode_text():
     actual = source_to_unicode(source)
     expected = source.decode('utf-8')
     assert actual == expected
+
+
+def test_utf8_bom():
+    unicode_bom = BOM_UTF8.decode('utf-8')
+
+    module = parso.parse(unicode_bom)
+    endmarker = module.children[0]
+    assert endmarker.type == 'endmarker'
+    assert unicode_bom == endmarker.prefix
+
+    module = parso.parse(unicode_bom + 'foo = 1')
+    expr_stmt = module.children[0]
+    assert expr_stmt.type == 'expr_stmt'
+    assert unicode_bom == expr_stmt.get_first_leaf().prefix