From 1b4c75608ab844f5763e294fec4cba3fbd2dbe10 Mon Sep 17 00:00:00 2001
From: Dave Halter
Date: Thu, 14 May 2020 23:34:14 +0200
Subject: [PATCH] Fix a python_bytes_to_unicode issue, fixes #107

---
 parso/utils.py     | 13 +++++++++++--
 test/test_utils.py | 14 ++++++++++++++
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/parso/utils.py b/parso/utils.py
index 5a49c32..1485928 100644
--- a/parso/utils.py
+++ b/parso/utils.py
@@ -105,8 +105,17 @@ def python_bytes_to_unicode(source, encoding='utf-8', errors='strict'):
     if not isinstance(encoding, unicode):
         encoding = unicode(encoding, 'utf-8', 'replace')
 
-    # Cast to unicode
-    return unicode(source, encoding, errors)
+    try:
+        # Cast to unicode
+        return unicode(source, encoding, errors)
+    except LookupError:
+        if errors == 'replace':
+            # This is a weird case that can happen if the given encoding is not
+            # a valid encoding. This usually shouldn't happen with provided
+            # encodings, but can happen if somebody uses encoding declarations
+            # like `# coding: foo-8`.
+            return unicode(source, 'utf-8', errors)
+        raise
 
 
 def version_info():
diff --git a/test/test_utils.py b/test/test_utils.py
index 3078151..44a4ce4 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -63,3 +63,17 @@ def test_utf8_bom():
     expr_stmt = module.children[0]
     assert expr_stmt.type == 'expr_stmt'
     assert unicode_bom == expr_stmt.get_first_leaf().prefix
+
+
+@pytest.mark.parametrize(
+    ('code', 'errors'), [
+        (b'# coding: wtf-12\nfoo', 'strict'),
+        (b'# coding: wtf-12\nfoo', 'replace'),
+    ]
+)
+def test_bytes_to_unicode_failing_encoding(code, errors):
+    if errors == 'strict':
+        with pytest.raises(LookupError):
+            python_bytes_to_unicode(code, errors=errors)
+    else:
+        python_bytes_to_unicode(code, errors=errors)