mirror of
https://github.com/davidhalter/parso.git
synced 2025-12-15 17:07:13 +08:00
Add the UTF-8 BOM to the prefix in the tokenizer.
This commit is contained in:
@@ -1,4 +1,5 @@
|
|||||||
import re
|
import re
|
||||||
|
from codecs import BOM_UTF8
|
||||||
|
|
||||||
from parso.python.tokenize import group
|
from parso.python.tokenize import group
|
||||||
|
|
||||||
@@ -34,15 +35,18 @@ class PrefixPart(object):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
unicode_bom = BOM_UTF8.decode('utf-8')
|
||||||
|
|
||||||
_comment = r'#[^\n\r\f]*'
|
_comment = r'#[^\n\r\f]*'
|
||||||
_backslash = r'\\\r?\n'
|
_backslash = r'\\\r?\n'
|
||||||
_newline = r'\r?\n'
|
_newline = r'\r?\n'
|
||||||
_form_feed = r'\f'
|
_form_feed = r'\f'
|
||||||
_only_spacing = '$'
|
_only_spacing = '$'
|
||||||
_spacing = r'[ \t]*'
|
_spacing = r'[ \t]*'
|
||||||
|
_bom = unicode_bom
|
||||||
|
|
||||||
_regex = group(
|
_regex = group(
|
||||||
_comment, _backslash, _newline, _form_feed, _only_spacing,
|
_comment, _backslash, _newline, _form_feed, _only_spacing, _bom,
|
||||||
capture=True
|
capture=True
|
||||||
)
|
)
|
||||||
_regex = re.compile(group(_spacing, capture=True) + _regex)
|
_regex = re.compile(group(_spacing, capture=True) + _regex)
|
||||||
@@ -54,6 +58,7 @@ _types = {
|
|||||||
'\f': 'formfeed',
|
'\f': 'formfeed',
|
||||||
'\n': 'newline',
|
'\n': 'newline',
|
||||||
'\r': 'newline',
|
'\r': 'newline',
|
||||||
|
unicode_bom: 'bom'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ import string
|
|||||||
import re
|
import re
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
import itertools as _itertools
|
import itertools as _itertools
|
||||||
|
from codecs import BOM_UTF8
|
||||||
|
|
||||||
from parso.python.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap,
|
from parso.python.token import (tok_name, N_TOKENS, ENDMARKER, STRING, NUMBER, opmap,
|
||||||
NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
|
NAME, OP, ERRORTOKEN, NEWLINE, INDENT, DEDENT)
|
||||||
@@ -22,6 +23,8 @@ from parso._compatibility import py_version, u
|
|||||||
from parso.utils import splitlines
|
from parso.utils import splitlines
|
||||||
|
|
||||||
|
|
||||||
|
BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')
|
||||||
|
|
||||||
if py_version >= 30:
|
if py_version >= 30:
|
||||||
# Python 3 has str.isidentifier() to check if a char is a valid identifier
|
# Python 3 has str.isidentifier() to check if a char is a valid identifier
|
||||||
is_identifier = str.isidentifier
|
is_identifier = str.isidentifier
|
||||||
@@ -227,9 +230,15 @@ def tokenize_lines(lines):
|
|||||||
new_line = True
|
new_line = True
|
||||||
prefix = '' # Should never be required, but here for safety
|
prefix = '' # Should never be required, but here for safety
|
||||||
additional_prefix = ''
|
additional_prefix = ''
|
||||||
|
first = True
|
||||||
for lnum, line in enumerate(lines, 1): # loop over lines in stream
|
for lnum, line in enumerate(lines, 1): # loop over lines in stream
|
||||||
pos, max = 0, len(line)
|
if first:
|
||||||
|
if line.startswith(BOM_UTF8_STRING):
|
||||||
|
additional_prefix = BOM_UTF8_STRING
|
||||||
|
line = line[1:]
|
||||||
|
first = False
|
||||||
|
|
||||||
|
pos, max = 0, len(line)
|
||||||
if contstr: # continued string
|
if contstr: # continued string
|
||||||
endmatch = endprog.match(line)
|
endmatch = endprog.match(line)
|
||||||
if endmatch:
|
if endmatch:
|
||||||
|
|||||||
@@ -1,9 +1,12 @@
|
|||||||
from itertools import zip_longest
|
from itertools import zip_longest
|
||||||
|
from codecs import BOM_UTF8
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import parso
|
import parso
|
||||||
|
|
||||||
|
unicode_bom = BOM_UTF8.decode('utf-8')
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(('string', 'tokens'), [
|
@pytest.mark.parametrize(('string', 'tokens'), [
|
||||||
('', ['']),
|
('', ['']),
|
||||||
@@ -48,6 +51,7 @@ def test_simple_prefix_splitting(string, tokens):
|
|||||||
('\\\n', ['backslash', 'spacing']),
|
('\\\n', ['backslash', 'spacing']),
|
||||||
(' \t', ['spacing']),
|
(' \t', ['spacing']),
|
||||||
(' \t ', ['spacing']),
|
(' \t ', ['spacing']),
|
||||||
|
(unicode_bom + ' # ', ['bom', 'comment', 'spacing']),
|
||||||
])
|
])
|
||||||
def test_prefix_splitting_types(string, types):
|
def test_prefix_splitting_types(string, types):
|
||||||
tree = parso.parse(string)
|
tree = parso.parse(string)
|
||||||
|
|||||||
@@ -1,4 +1,7 @@
|
|||||||
|
from codecs import BOM_UTF8
|
||||||
|
|
||||||
from parso.utils import splitlines, source_to_unicode
|
from parso.utils import splitlines, source_to_unicode
|
||||||
|
import parso
|
||||||
|
|
||||||
|
|
||||||
def test_splitlines_no_keepends():
|
def test_splitlines_no_keepends():
|
||||||
@@ -25,3 +28,17 @@ def test_source_to_unicode_unicode_text():
|
|||||||
actual = source_to_unicode(source)
|
actual = source_to_unicode(source)
|
||||||
expected = source.decode('utf-8')
|
expected = source.decode('utf-8')
|
||||||
assert actual == expected
|
assert actual == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_utf8_bom():
|
||||||
|
unicode_bom = BOM_UTF8.decode('utf-8')
|
||||||
|
|
||||||
|
module = parso.parse(unicode_bom)
|
||||||
|
endmarker = module.children[0]
|
||||||
|
assert endmarker.type == 'endmarker'
|
||||||
|
assert unicode_bom == endmarker.prefix
|
||||||
|
|
||||||
|
module = parso.parse(unicode_bom + 'foo = 1')
|
||||||
|
expr_stmt = module.children[0]
|
||||||
|
assert expr_stmt.type == 'expr_stmt'
|
||||||
|
assert unicode_bom == expr_stmt.get_first_leaf().prefix
|
||||||
|
|||||||
Reference in New Issue
Block a user