From 149ab237b244fa0b9e3a31f73a9b067372081ddc Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 5 Jul 2024 16:27:07 +0200 Subject: [PATCH] Raise Message Exception when displaying binary data Fixes https://sentry.galaxyproject.org/share/issue/a8843884527f4e4089b32fd14a2f126d/: ``` UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 4: invalid start byte File "galaxy/web/framework/middleware/error.py", line 167, in __call__ app_iter = self.application(environ, sr_checker) File "galaxy/web/framework/middleware/statsd.py", line 29, in __call__ req = self.application(environ, start_response) File "/cvmfs/main.galaxyproject.org/venv/lib/python3.11/site-packages/paste/httpexceptions.py", line 635, in __call__ return self.application(environ, start_response) File "galaxy/web/framework/base.py", line 174, in __call__ return self.handle_request(request_id, path_info, environ, start_response) File "galaxy/web/framework/base.py", line 263, in handle_request body = method(trans, **kwargs) File "galaxy/webapps/galaxy/controllers/dataset.py", line 152, in display display_data, headers = data.datatype.display_data( File "galaxy/datatypes/sequence.py", line 785, in display_data "/dataset/large_file.mako", truncated_data=fh.read(max_peek_size), data=dataset File "", line 322, in decode ``` Which is a BAM file assigned to fastqsanger.gz --- lib/galaxy/datatypes/sequence.py | 15 +++++++++------ lib/galaxy/datatypes/tabular.py | 14 +++++++++----- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/lib/galaxy/datatypes/sequence.py b/lib/galaxy/datatypes/sequence.py index 911a1a2db31a..7a721d8df465 100644 --- a/lib/galaxy/datatypes/sequence.py +++ b/lib/galaxy/datatypes/sequence.py @@ -41,6 +41,7 @@ get_headers, iter_headers, ) +from galaxy.exceptions import InvalidFileFormatError from galaxy.util import ( compression_utils, nice_size, @@ -775,15 +776,17 @@ def display_data( headers = kwd.get("headers", {}) if preview: with 
compression_utils.get_fileobj(dataset.get_file_name()) as fh: - max_peek_size = 1000000 # 1 MB - if os.stat(dataset.get_file_name()).st_size < max_peek_size: + max_peek_size = 100000 + try: + chunk = fh.read(max_peek_size + 1) + except UnicodeDecodeError: + raise InvalidFileFormatError("Dataset appears to contain binary data, cannot display.") + if len(chunk) <= max_peek_size: mime = "text/plain" self._clean_and_set_mime_type(trans, mime, headers) - return fh.read(), headers + return chunk, headers return ( - trans.fill_template_mako( - "/dataset/large_file.mako", truncated_data=fh.read(max_peek_size), data=dataset - ), + trans.fill_template_mako("/dataset/large_file.mako", truncated_data=chunk[:-1], data=dataset), headers, ) else: diff --git a/lib/galaxy/datatypes/tabular.py b/lib/galaxy/datatypes/tabular.py index 5b4e1d523e12..db9ef1293294 100644 --- a/lib/galaxy/datatypes/tabular.py +++ b/lib/galaxy/datatypes/tabular.py @@ -65,6 +65,7 @@ iter_headers, validate_tabular, ) +from galaxy.exceptions import InvalidFileFormatError from galaxy.util import compression_utils from galaxy.util.compression_utils import ( FileObjType, @@ -156,12 +157,15 @@ def get_chunk(self, trans, dataset: HasFileName, offset: int = 0, ck_size: Optio def _read_chunk(self, trans, dataset: HasFileName, offset: int, ck_size: Optional[int] = None): with compression_utils.get_fileobj(dataset.get_file_name()) as f: f.seek(offset) - ck_data = f.read(ck_size or trans.app.config.display_chunk_size) - if ck_data and ck_data[-1] != "\n": - cursor = f.read(1) - while cursor and cursor != "\n": - ck_data += cursor + try: + ck_data = f.read(ck_size or trans.app.config.display_chunk_size) + if ck_data and ck_data[-1] != "\n": cursor = f.read(1) + while cursor and cursor != "\n": + ck_data += cursor + cursor = f.read(1) + except UnicodeDecodeError: + raise InvalidFileFormatError("Dataset appears to contain binary data, cannot display.") last_read = f.tell() return ck_data, last_read