From 1ed2af1a6900a3b1c1950cb5f10a39c73d1530f5 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 12 Apr 2024 13:25:01 +0200 Subject: [PATCH] Fix get_content_as_text for compressed text datatypes Fixes: ``` UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte File "starlette/applications.py", line 123, in __call__ await self.middleware_stack(scope, receive, send) File "starlette/middleware/errors.py", line 186, in __call__ raise exc File "starlette/middleware/errors.py", line 164, in __call__ await self.app(scope, receive, _send) File "starlette_context/middleware/raw_middleware.py", line 92, in __call__ await self.app(scope, receive, send_wrapper) File "starlette/middleware/base.py", line 189, in __call__ with collapse_excgroups(): File "contextlib.py", line 155, in __exit__ self.gen.throw(typ, value, traceback) File "starlette/_utils.py", line 93, in collapse_excgroups raise exc File "starlette/middleware/base.py", line 191, in __call__ response = await self.dispatch_func(request, call_next) File "galaxy/webapps/galaxy/fast_app.py", line 108, in add_x_frame_options response = await call_next(request) File "starlette/middleware/base.py", line 165, in call_next raise app_exc File "starlette/middleware/base.py", line 151, in coro await self.app(scope, receive_or_disconnect, send_no_error) File "starlette/middleware/exceptions.py", line 62, in __call__ await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send) File "starlette/_exception_handler.py", line 64, in wrapped_app raise exc File "starlette/_exception_handler.py", line 53, in wrapped_app await app(scope, receive, sender) File "starlette/routing.py", line 758, in __call__ await self.middleware_stack(scope, receive, send) File "starlette/routing.py", line 778, in app await route.handle(scope, receive, send) File "starlette/routing.py", line 299, in handle await self.app(scope, receive, send) File "starlette/routing.py", line 79, in app await 
wrap_app_handling_exceptions(app, request)(scope, receive, send) File "starlette/_exception_handler.py", line 64, in wrapped_app raise exc File "starlette/_exception_handler.py", line 53, in wrapped_app await app(scope, receive, sender) File "starlette/routing.py", line 74, in app response = await func(request) File "fastapi/routing.py", line 278, in app raw_response = await run_endpoint_function( File "fastapi/routing.py", line 193, in run_endpoint_function return await run_in_threadpool(dependant.call, **values) File "starlette/concurrency.py", line 42, in run_in_threadpool return await anyio.to_thread.run_sync(func, *args) File "anyio/to_thread.py", line 56, in run_sync return await get_async_backend().run_sync_in_worker_thread( File "anyio/_backends/_asyncio.py", line 2144, in run_sync_in_worker_thread return await future File "anyio/_backends/_asyncio.py", line 851, in run result = context.run(func, *args) File "galaxy/webapps/galaxy/api/datasets.py", line 192, in get_content_as_text return self.service.get_content_as_text(trans, dataset_id) File "galaxy/webapps/galaxy/services/datasets.py", line 643, in get_content_as_text truncated, dataset_data = self.hda_manager.text_data(hda, preview=True) File "galaxy/managers/hdas.py", line 310, in text_data hda_data = open(hda.get_file_name()).read(MAX_PEEK_SIZE) File "<frozen codecs>", line 322, in decode ``` from https://sentry.galaxyproject.org/share/issue/9eb8e5b692b94700ac9b304b6d1c2418/ --- lib/galaxy/managers/hdas.py | 9 ++++++--- lib/galaxy_test/api/test_datasets.py | 10 ++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/lib/galaxy/managers/hdas.py b/lib/galaxy/managers/hdas.py index e600ab311cd2..3be812fcf0e8 100644 --- a/lib/galaxy/managers/hdas.py +++ b/lib/galaxy/managers/hdas.py @@ -68,6 +68,7 @@ MinimalManagerApp, StructuredApp, ) +from galaxy.util.compression_utils import get_fileobj log = logging.getLogger(__name__) @@ -303,11 +304,13 @@ def text_data(self, hda, preview=True): # For now, cannot 
get data from non-text datasets. if not isinstance(hda.datatype, datatypes.data.Text): return truncated, hda_data - if not os.path.exists(hda.get_file_name()): + file_path = hda.get_file_name() + if not os.path.exists(file_path): return truncated, hda_data - truncated = preview and os.stat(hda.get_file_name()).st_size > MAX_PEEK_SIZE - hda_data = open(hda.get_file_name()).read(MAX_PEEK_SIZE) + truncated = preview and os.stat(file_path).st_size > MAX_PEEK_SIZE + with get_fileobj(file_path) as fh: + hda_data = fh.read(MAX_PEEK_SIZE) return truncated, hda_data # .... annotatable diff --git a/lib/galaxy_test/api/test_datasets.py b/lib/galaxy_test/api/test_datasets.py index fdd139d78640..3c8c9daf3420 100644 --- a/lib/galaxy_test/api/test_datasets.py +++ b/lib/galaxy_test/api/test_datasets.py @@ -12,6 +12,7 @@ one_hda_model_store_dict, TEST_SOURCE_URI, ) +from galaxy.tool_util.verify.test_data import TestDataResolver from galaxy.util.unittest_utils import skip_if_github_down from galaxy_test.base.api_asserts import assert_has_keys from galaxy_test.base.decorators import ( @@ -356,6 +357,15 @@ def test_get_content_as_text(self, history_id): self._assert_has_key(get_content_as_text_response.json(), "item_data") assert get_content_as_text_response.json().get("item_data") == contents + def test_get_content_as_text_with_compressed_text_data(self, history_id): + test_data_resolver = TestDataResolver() + with open(test_data_resolver.get_filename("1.fasta.gz"), mode="rb") as fh: + hda1 = self.dataset_populator.new_dataset(history_id, content=fh, ftype="fasta.gz", wait=True) + get_content_as_text_response = self._get(f"datasets/{hda1['id']}/get_content_as_text") + self._assert_status_code_is(get_content_as_text_response, 200) + self._assert_has_key(get_content_as_text_response.json(), "item_data") + assert ">hg17" in get_content_as_text_response.json().get("item_data") + def test_anon_get_content_as_text(self, history_id): contents = "accessible data" hda1 = 
self.dataset_populator.new_dataset(history_id, content=contents, wait=True)