From 61b94ae593e2e7962e74fd5412f87d0bfff811c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Helleu?= Date: Wed, 23 Oct 2024 18:05:16 +0200 Subject: [PATCH] Fix UnicodeDecodeError in case of invalid UTF-8 in input file --- CHANGELOG.md | 4 ++++ msgcheck/po.py | 2 +- tests/fr_invalid_utf8.po | 40 ++++++++++++++++++++++++++++++++++++++++ tests/test_msgcheck.py | 13 +++++++++++++ 4 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 tests/fr_invalid_utf8.po diff --git a/CHANGELOG.md b/CHANGELOG.md index 81388e5..cfa06d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ - Use file README.md as package long description +### Fixed + +- Fix UnicodeDecodeError in case of invalid UTF-8 in input file + ## Version 4.0.0 (2022-01-23) ### Changed diff --git a/msgcheck/po.py b/msgcheck/po.py index 2249275..c229796 100644 --- a/msgcheck/po.py +++ b/msgcheck/po.py @@ -468,7 +468,7 @@ def read(self): # pylint: disable=too-many-locals """ self.msgs = [] checker = Checker() - with open(self.filename, "r", encoding="utf-8") as po_file: + with open(self.filename, "r", encoding="utf-8", errors="ignore") as po_file: for line in po_file: message = checker.check_line(line.strip()) if message: diff --git a/tests/fr_invalid_utf8.po b/tests/fr_invalid_utf8.po new file mode 100644 index 0000000..12a243e --- /dev/null +++ b/tests/fr_invalid_utf8.po @@ -0,0 +1,40 @@ +# +# Copyright (C) 2024 Sébastien Helleu +# +# This file is part of msgcheck. +# +# Msgcheck is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# Msgcheck is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with msgcheck. If not, see <https://www.gnu.org/licenses/>. +# + +# +# Gettext file with invalid UTF-8 chars. +# + +msgid "" +msgstr "" +"Project-Id-Version: msgcheck\n" +"Report-Msgid-Bugs-To: flashcode@flashtux.org\n" +"POT-Creation-Date: 2014-05-03 12:00+0200\n" +"PO-Revision-Date: 2024-09-12 17:02+0200\n" +"Last-Translator: Sébastien Helleu \n" +"Language-Team: flashcode@flashtux.org\n" +"Language: fr\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=iso-8859-13\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=2; plural=(n > 1);\n" + +# Normal string with special chars +msgid "id-õäöü" +msgstr "str-þð" diff --git a/tests/test_msgcheck.py b/tests/test_msgcheck.py index 87b72ba..b29ba5d 100644 --- a/tests/test_msgcheck.py +++ b/tests/test_msgcheck.py @@ -434,3 +434,16 @@ def test_punct_full_stop_ja_zh(language, msgid, msgstr, error_message): assert error_message in errors[0].message else: assert not errors + + +def test_invalid_utf8(): + """Test checks on a file with invalid UTF-8 chars.""" + po_check = PoCheck() + po_check.set_check("fuzzy", True) + result = po_check.check_files([local_path("fr_invalid_utf8.po")]) + + # be sure we have one file in result + assert len(result) == 1 + + # the file has no errors + assert len(result[0][1]) == 0