Add the UTF-8 BOM to the prefix in the tokenizer.

This commit is contained in:
Dave Halter
2017-07-10 23:26:15 +02:00
parent b3923e65e8
commit 859c48170e
4 changed files with 37 additions and 2 deletions

View File

@@ -1,4 +1,5 @@
import re
from codecs import BOM_UTF8
from parso.python.tokenize import group
@@ -34,15 +35,18 @@ class PrefixPart(object):
)
unicode_bom = BOM_UTF8.decode('utf-8')
_comment = r'#[^\n\r\f]*'
_backslash = r'\\\r?\n'
_newline = r'\r?\n'
_form_feed = r'\f'
_only_spacing = '$'
_spacing = r'[ \t]*'
_bom = unicode_bom
_regex = group(
_comment, _backslash, _newline, _form_feed, _only_spacing,
_comment, _backslash, _newline, _form_feed, _only_spacing, _bom,
capture=True
)
_regex = re.compile(group(_spacing, capture=True) + _regex)
@@ -54,6 +58,7 @@ _types = {
'\f': 'formfeed',
'\n': 'newline',
'\r': 'newline',
unicode_bom: 'bom'
}

View File

@@ -15,6 +15,7 @@ import string
import re
from collections import namedtuple
import itertools as _itertools
from codecs import BOM_UTF8
from parso.python.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap,
NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
@@ -22,6 +23,8 @@ from parso._compatibility import py_version, u
from parso.utils import splitlines
BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')
if py_version >= 30:
# Python 3 has str.isidentifier() to check if a char is a valid identifier
is_identifier = str.isidentifier
@@ -227,9 +230,15 @@ def tokenize_lines(lines):
new_line = True
prefix = '' # Should never be required, but here for safety
additional_prefix = ''
first = True
for lnum, line in enumerate(lines, 1): # loop over lines in stream
pos, max = 0, len(line)
if first:
if line.startswith(BOM_UTF8_STRING):
additional_prefix = BOM_UTF8_STRING
line = line[1:]
first = False
pos, max = 0, len(line)
if contstr: # continued string
endmatch = endprog.match(line)
if endmatch:

View File

@@ -1,9 +1,12 @@
from itertools import zip_longest
from codecs import BOM_UTF8
import pytest
import parso
unicode_bom = BOM_UTF8.decode('utf-8')
@pytest.mark.parametrize(('string', 'tokens'), [
('', ['']),
@@ -48,6 +51,7 @@ def test_simple_prefix_splitting(string, tokens):
('\\\n', ['backslash', 'spacing']),
(' \t', ['spacing']),
(' \t ', ['spacing']),
(unicode_bom + ' # ', ['bom', 'comment', 'spacing']),
])
def test_prefix_splitting_types(string, types):
tree = parso.parse(string)

View File

@@ -1,4 +1,7 @@
from codecs import BOM_UTF8
from parso.utils import splitlines, source_to_unicode
import parso
def test_splitlines_no_keepends():
@@ -25,3 +28,17 @@ def test_source_to_unicode_unicode_text():
actual = source_to_unicode(source)
expected = source.decode('utf-8')
assert actual == expected
def test_utf8_bom():
    """A leading UTF-8 BOM must survive parsing as the first node's prefix."""
    bom = BOM_UTF8.decode('utf-8')

    # A module containing only the BOM: it becomes the endmarker's prefix.
    tree = parso.parse(bom)
    end_node = tree.children[0]
    assert end_node.type == 'endmarker'
    assert end_node.prefix == bom

    # A module with real code: the BOM is the prefix of the very first leaf.
    tree = parso.parse(bom + 'foo = 1')
    stmt = tree.children[0]
    assert stmt.type == 'expr_stmt'
    assert stmt.get_first_leaf().prefix == bom