diff --git a/parso/python/prefix.py b/parso/python/prefix.py
index 38c2c54..9cf5712 100644
--- a/parso/python/prefix.py
+++ b/parso/python/prefix.py
@@ -1,4 +1,5 @@
 import re
+from codecs import BOM_UTF8
 
 from parso.python.tokenize import group
 
@@ -34,15 +35,18 @@ class PrefixPart(object):
         )
 
 
+unicode_bom = BOM_UTF8.decode('utf-8')
+
 _comment = r'#[^\n\r\f]*'
 _backslash = r'\\\r?\n'
 _newline = r'\r?\n'
 _form_feed = r'\f'
 _only_spacing = '$'
 _spacing = r'[ \t]*'
+_bom = unicode_bom
 
 _regex = group(
-    _comment, _backslash, _newline, _form_feed, _only_spacing,
+    _comment, _backslash, _newline, _form_feed, _only_spacing, _bom,
     capture=True
 )
 _regex = re.compile(group(_spacing, capture=True) + _regex)
@@ -54,6 +58,7 @@ _types = {
     '\f': 'formfeed',
     '\n': 'newline',
     '\r': 'newline',
+    unicode_bom: 'bom'
 }
 
 
diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py
index a8a6ac2..19a3a8f 100644
--- a/parso/python/tokenize.py
+++ b/parso/python/tokenize.py
@@ -15,6 +15,7 @@ import string
 import re
 from collections import namedtuple
 import itertools as _itertools
+from codecs import BOM_UTF8
 
 from parso.python.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER,
                                 opmap, NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
@@ -22,6 +23,8 @@ from parso._compatibility import py_version, u
 from parso.utils import splitlines
 
 
+BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')
+
 if py_version >= 30:
     # Python 3 has str.isidentifier() to check if a char is a valid identifier
     is_identifier = str.isidentifier
@@ -227,9 +230,15 @@ def tokenize_lines(lines):
     new_line = True
     prefix = ''  # Should never be required, but here for safety
     additional_prefix = ''
+    first = True
     for lnum, line in enumerate(lines, 1):  # loop over lines in stream
-        pos, max = 0, len(line)
+        if first:
+            if line.startswith(BOM_UTF8_STRING):
+                additional_prefix = BOM_UTF8_STRING
+                line = line[1:]
+            first = False
+        pos, max = 0, len(line)
 
         if contstr:  # continued string
             endmatch = endprog.match(line)
             if endmatch:
diff --git a/test/test_prefix.py b/test/test_prefix.py
index 88f40e7..a628b9f 100644
--- a/test/test_prefix.py
+++ b/test/test_prefix.py
@@ -1,9 +1,12 @@
 from itertools import zip_longest
+from codecs import BOM_UTF8
 
 import pytest
 
 import parso
 
+unicode_bom = BOM_UTF8.decode('utf-8')
+
 
 @pytest.mark.parametrize(('string', 'tokens'), [
     ('', ['']),
@@ -48,6 +51,7 @@ def test_simple_prefix_splitting(string, tokens):
     ('\\\n', ['backslash', 'spacing']),
     (' \t', ['spacing']),
     (' \t ', ['spacing']),
+    (unicode_bom + ' # ', ['bom', 'comment', 'spacing']),
 ])
 def test_prefix_splitting_types(string, types):
     tree = parso.parse(string)
diff --git a/test/test_utils.py b/test/test_utils.py
index bbdd897..f9a92fc 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1,4 +1,7 @@
+from codecs import BOM_UTF8
+
 from parso.utils import splitlines, source_to_unicode
+import parso
 
 
 def test_splitlines_no_keepends():
@@ -25,3 +28,17 @@ def test_source_to_unicode_unicode_text():
     actual = source_to_unicode(source)
     expected = source.decode('utf-8')
     assert actual == expected
+
+
+def test_utf8_bom():
+    unicode_bom = BOM_UTF8.decode('utf-8')
+
+    module = parso.parse(unicode_bom)
+    endmarker = module.children[0]
+    assert endmarker.type == 'endmarker'
+    assert unicode_bom == endmarker.prefix
+
+    module = parso.parse(unicode_bom + 'foo = 1')
+    expr_stmt = module.children[0]
+    assert expr_stmt.type == 'expr_stmt'
+    assert unicode_bom == expr_stmt.get_first_leaf().prefix