diff --git a/parso/grammar.py b/parso/grammar.py
index 0a72ab6..07cbe25 100644
--- a/parso/grammar.py
+++ b/parso/grammar.py
@@ -84,7 +84,9 @@ class Grammar(object):
 
         if code is None:
             with open(path, 'rb') as f:
-                code = source_to_unicode(f.read())
+                code = f.read()
+
+        code = source_to_unicode(code)
 
         lines = splitlines(code, keepends=True)
         if diff_cache:
diff --git a/parso/utils.py b/parso/utils.py
index bfbb2a1..904fef5 100644
--- a/parso/utils.py
+++ b/parso/utils.py
@@ -48,7 +48,10 @@ def splitlines(string, keepends=False):
         return re.split('\n|\r\n', string)
 
 
-def source_to_unicode(source, encoding=None):
+def source_to_unicode(source, default_encoding='utf-8', errors='strict'):
+    """
+    `errors` can be 'strict', 'replace' or 'ignore'.
+    """
     def detect_encoding():
         """
         For the implementation of encoding definitions in Python, look at:
@@ -67,7 +70,7 @@ def source_to_unicode(source, encoding=None):
             return possible_encoding.group(1)
         else:
             # the default if nothing else has been set -> PEP 263
-            return encoding if encoding is not None else 'utf-8'
+            return default_encoding
 
     if isinstance(source, unicode):
         # only cast str/bytes
@@ -76,8 +79,9 @@ def source_to_unicode(source, encoding=None):
     encoding = detect_encoding()
     if not isinstance(encoding, unicode):
         encoding = unicode(encoding, 'utf-8', 'replace')
-    # cast to unicode by default
-    return unicode(source, encoding, 'replace')
+
+    # Cast to unicode
+    return unicode(source, encoding, errors)
 
 
 def version_info():
diff --git a/test/test_grammar.py b/test/test_grammar.py
new file mode 100644
index 0000000..60a249b
--- /dev/null
+++ b/test/test_grammar.py
@@ -0,0 +1,8 @@
+import parso
+
+import pytest
+
+
+def test_non_unicode():
+    with pytest.raises(UnicodeDecodeError):
+        parso.parse(b'\xe4')