Add the UTF-8 BOM to the prefix in the tokenizer.

This commit is contained in:
Dave Halter
2017-07-10 23:26:15 +02:00
parent b3923e65e8
commit 859c48170e
4 changed files with 37 additions and 2 deletions

View File

@@ -1,9 +1,12 @@
from itertools import zip_longest
from codecs import BOM_UTF8
import pytest
import parso
# The UTF-8 byte order mark decoded to a text string, for building test input.
unicode_bom = BOM_UTF8.decode('utf-8')
@pytest.mark.parametrize(('string', 'tokens'), [
('', ['']),
@@ -48,6 +51,7 @@ def test_simple_prefix_splitting(string, tokens):
('\\\n', ['backslash', 'spacing']),
(' \t', ['spacing']),
(' \t ', ['spacing']),
(unicode_bom + ' # ', ['bom', 'comment', 'spacing']),
])
def test_prefix_splitting_types(string, types):
tree = parso.parse(string)

View File

@@ -1,4 +1,7 @@
from codecs import BOM_UTF8
from parso.utils import splitlines, source_to_unicode
import parso
def test_splitlines_no_keepends():
@@ -25,3 +28,17 @@ def test_source_to_unicode_unicode_text():
actual = source_to_unicode(source)
expected = source.decode('utf-8')
assert actual == expected
def test_utf8_bom():
    """Check that a leading UTF-8 BOM is kept in the first leaf's prefix.

    Two inputs are parsed: one that consists of the BOM alone, and one
    where the BOM is followed by a simple assignment.
    """
    bom = BOM_UTF8.decode('utf-8')

    # BOM only: the first child of the module is expected to be the
    # endmarker, with the BOM as its prefix.
    bom_only_tree = parso.parse(bom)
    first_child = bom_only_tree.children[0]
    assert first_child.type == 'endmarker'
    assert bom == first_child.prefix

    # BOM followed by code: the BOM is expected in the prefix of the
    # first leaf of the statement.
    assignment_tree = parso.parse(bom + 'foo = 1')
    statement = assignment_tree.children[0]
    assert statement.type == 'expr_stmt'
    leading_leaf = statement.get_first_leaf()
    assert bom == leading_leaf.prefix