diff --git a/parso/python/prefix.py b/parso/python/prefix.py index 9cf5712..b7f1e1b 100644 --- a/parso/python/prefix.py +++ b/parso/python/prefix.py @@ -3,6 +3,8 @@ from codecs import BOM_UTF8 from parso.python.tokenize import group +unicode_bom = BOM_UTF8.decode('utf-8') + class PrefixPart(object): def __init__(self, leaf, typ, value, spacing='', start_pos=None): @@ -17,6 +19,9 @@ class PrefixPart(object): def end_pos(self): if self.value.endswith('\n'): return self.start_pos[0] + 1, 0 + if self.value == unicode_bom: + # The bom doesn't have a length at the start of a Python file. + return self.start_pos return self.start_pos[0], self.start_pos[1] + len(self.value) def create_spacing_part(self): @@ -35,8 +40,6 @@ class PrefixPart(object): ) -unicode_bom = BOM_UTF8.decode('utf-8') - _comment = r'#[^\n\r\f]*' _backslash = r'\\\r?\n' _newline = r'\r?\n' @@ -66,6 +69,7 @@ def split_prefix(leaf, start_pos): line, column = start_pos start = 0 value = spacing = '' + bom = False while start != len(leaf.prefix): match =_regex.match(leaf.prefix, start) spacing = match.group(1) @@ -75,8 +79,10 @@ def split_prefix(leaf, start_pos): type_ = _types[value[0]] yield PrefixPart( leaf, type_, value, spacing, - start_pos=(line, column + start + len(spacing)) + start_pos=(line, column + start - int(bom) + len(spacing)) ) + if type_ == 'bom': + bom = True start = match.end(0) if value.endswith('\n'): diff --git a/test/test_prefix.py b/test/test_prefix.py index a628b9f..369e6c8 100644 --- a/test/test_prefix.py +++ b/test/test_prefix.py @@ -59,3 +59,16 @@ def test_prefix_splitting_types(string, types): assert leaf.type == 'endmarker' parsed_tokens = list(leaf._split_prefix()) assert [t.type for t in parsed_tokens] == types + + +def test_utf8_bom(): + tree = parso.parse(unicode_bom + 'a = 1') + expr_stmt = tree.children[0] + assert expr_stmt.start_pos == (1, 0) + + tree = parso.parse(unicode_bom + '\n') + endmarker = tree.children[0] + parts = list(endmarker._split_prefix()) + assert [p.type for p in parts] == ['bom', 'newline', 'spacing'] + assert [p.start_pos for p in parts] == [(1, 0), (1, 0), (2, 0)] + assert [p.end_pos for p in parts] == [(1, 0), (2, 0), (2, 0)]