Fix UTF-8 BOM positions.

2025-12-06 21:04:29 +08:00 · 2017-07-10 23:38:44 +02:00
parent 859c48170e
commit ff949d1061
2 changed files with 22 additions and 3 deletions
--- a/parso/python/prefix.py
+++ b/parso/python/prefix.py
@@ -3,6 +3,8 @@ from codecs import BOM_UTF8
 from parso.python.tokenize import group
 unicode_bom = BOM_UTF8.decode('utf-8')
 class PrefixPart(object):
    def __init__(self, leaf, typ, value, spacing='', start_pos=None):
@@ -17,6 +19,9 @@ class PrefixPart(object):
    def end_pos(self):
        """Return the ``(line, column)`` position at which this part ends.

        Three cases are distinguished, based on ``self.value``:

        * a value ending in a newline ends at column 0 of the next line;
        * a value equal to the unicode BOM ends exactly where it starts;
        * any other value ends ``len(self.value)`` columns after the
          start column, on the same line.
        """
        if self.value.endswith('\n'):
            # The newline closes the line, so the end is the start of the
            # following one.
            return self.start_pos[0] + 1, 0
        if self.value == unicode_bom:
            # The bom doesn't have a length at the start of a Python file.
            return self.start_pos
        return self.start_pos[0], self.start_pos[1] + len(self.value)
    def create_spacing_part(self):
@@ -35,8 +40,6 @@ class PrefixPart(object):
        )
 unicode_bom = BOM_UTF8.decode('utf-8')
 _comment = r'#[^\n\r\f]*'
 _backslash = r'\\\r?\n'
 _newline = r'\r?\n'
@@ -66,6 +69,7 @@ def split_prefix(leaf, start_pos):
    line, column = start_pos
    start = 0
    value = spacing = ''
    bom = False
    while start != len(leaf.prefix):
        match =_regex.match(leaf.prefix, start)
        spacing = match.group(1)
@@ -75,8 +79,10 @@ def split_prefix(leaf, start_pos):
        type_ = _types[value[0]]
        yield PrefixPart(
            leaf, type_, value, spacing,
-            start_pos=(line, column + start + len(spacing))
+            start_pos=(line, column + start - int(bom) + len(spacing))
        )
        if type_ == 'bom':
            bom = True
        start = match.end(0)
        if value.endswith('\n'):
--- a/test/test_prefix.py
+++ b/test/test_prefix.py
@@ -59,3 +59,16 @@ def test_prefix_splitting_types(string, types):
    assert leaf.type == 'endmarker'
    parsed_tokens = list(leaf._split_prefix())
    assert [t.type for t in parsed_tokens] == types
 def test_utf8_bom():
    """Check that a leading UTF-8 BOM does not shift reported positions."""
    # With a BOM in front of the code, the first statement must still be
    # reported as starting at line 1, column 0.
    tree = parso.parse(unicode_bom + 'a = 1')
    expr_stmt = tree.children[0]
    assert expr_stmt.start_pos == (1, 0)
    # A BOM followed only by a newline: everything ends up in the prefix of
    # the first child, which the test treats as the endmarker.
    tree = parso.parse(unicode_bom + '\n')
    endmarker = tree.children[0]
    parts = list(endmarker._split_prefix())
    assert [p.type for p in parts] == ['bom', 'newline', 'spacing']
    # The BOM occupies no column, so the newline also starts at (1, 0); the
    # trailing spacing part is zero-width at the start of line 2.
    assert [p.start_pos for p in parts] == [(1, 0), (1, 0), (2, 0)]
    assert [p.end_pos for p in parts] == [(1, 0), (2, 0), (2, 0)]