Add the UTF-8 BOM to the prefix in the tokenizer.

2026-02-08 19:01:25 +08:00 · 2017-07-10 23:26:15 +02:00
parent b3923e65e8
commit 859c48170e
4 changed files with 37 additions and 2 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1,4 +1,7 @@
+from codecs import BOM_UTF8
+
 from parso.utils import splitlines, source_to_unicode
+import parso


 def test_splitlines_no_keepends():
@@ -25,3 +28,17 @@ def test_source_to_unicode_unicode_text():
    actual = source_to_unicode(source)
    expected = source.decode('utf-8')
    assert actual == expected
+
+
+def test_utf8_bom():
+    unicode_bom = BOM_UTF8.decode('utf-8')  # the UTF-8 BOM decoded to text
+
+    module = parso.parse(unicode_bom)  # case 1: the input is the BOM alone
+    endmarker = module.children[0]
+    assert endmarker.type == 'endmarker'
+    assert unicode_bom == endmarker.prefix  # the BOM is expected in the prefix
+
+    module = parso.parse(unicode_bom + 'foo = 1')  # case 2: BOM followed by a statement
+    expr_stmt = module.children[0]
+    assert expr_stmt.type == 'expr_stmt'
+    assert unicode_bom == expr_stmt.get_first_leaf().prefix  # BOM expected in the first leaf's prefix