From 26e612e21e18e7083a422e165c32699b04fdf0e9 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Mon, 23 Oct 2023 14:38:27 -0400 Subject: [PATCH 1/6] fix: secure filename + keep filename if uploading file bytes --- chord_drs/models.py | 13 +++++++++---- chord_drs/routes.py | 5 ++++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/chord_drs/models.py b/chord_drs/models.py index 906f6cc..11f66dc 100644 --- a/chord_drs/models.py +++ b/chord_drs/models.py @@ -4,6 +4,7 @@ from pathlib import Path from sqlalchemy.sql import func from sqlalchemy.orm import relationship +from werkzeug.utils import secure_filename from urllib.parse import urlparse from uuid import uuid4 @@ -71,6 +72,9 @@ def __init__(self, *args, **kwargs): # If set, we are deduplicating with an existing file object object_to_copy: DrsBlob | None = kwargs.get("object_to_copy") + # If set, we are overriding the filename to save the file to + filename: str | None = kwargs.get("filename") + self.id = str(uuid4()) if object_to_copy: @@ -88,8 +92,8 @@ def __init__(self, *args, **kwargs): # TODO: we will need to account for URLs at some point raise FileNotFoundError("Provided file path does not exists") - self.name = p.name - new_filename = f"{self.id[:12]}-{p.name}" # TODO: use checksum for filename instead + self.name = secure_filename(filename or p.name) + new_filename = f"{self.id[:12]}-{self.name}" # TODO: use checksum for filename instead backend = get_backend() @@ -104,8 +108,9 @@ def __init__(self, *args, **kwargs): # TODO: implement more specific exception handling raise Exception("Well if the file is not saved... we can't do squat") - if "location" in kwargs: - del kwargs["location"] + for key_to_remove in ("location", "filename"): + if key_to_remove in kwargs: + del kwargs[key_to_remove] super().__init__(*args, **kwargs) diff --git a/chord_drs/routes.py b/chord_drs/routes.py index 6c1bdce..e223c44 100644 --- a/chord_drs/routes.py +++ b/chord_drs/routes.py @@ -473,9 +473,11 @@ def object_ingest(): tfh, t_obj_path = tempfile.mkstemp() try: - if file: + filename: str | None = None # no override, use path filename if path is specified instead of a file upload + if file is not None: file.save(t_obj_path) obj_path = t_obj_path + filename = file.filename # still may be none, in which case the temporary filename will be used if deduplicate: # Get checksum of original file, and query database for objects that match @@ -507,6 +509,7 @@ def object_ingest(): try: drs_object = DrsBlob( **(dict(object_to_copy=object_to_copy) if object_to_copy else dict(location=obj_path)), + filename=filename, project_id=project_id, dataset_id=dataset_id, data_type=data_type, From f50d4422927dbb699f541f559e236b02dfe406cf Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Mon, 23 Oct 2023 14:39:16 -0400 Subject: [PATCH 2/6] chore: bump version to 0.13.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c247ec6..dee4b22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "chord-drs" -version = "0.13.0" +version = "0.13.1" description = "An implementation of a data repository system (as per GA4GH's specs) for the Bento platform." authors = ["David Lougheed "] license = "LGPL-3.0" From 2403a0bf35b842021657004de8925a3924577ef2 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Mon, 23 Oct 2023 15:53:25 -0400 Subject: [PATCH 3/6] chore: revert version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index dee4b22..c247ec6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "chord-drs" -version = "0.13.1" +version = "0.13.0" description = "An implementation of a data repository system (as per GA4GH's specs) for the Bento platform." authors = ["David Lougheed "] license = "LGPL-3.0" From e42a1aef505e89a466328564ea6463305d647570 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 25 Oct 2023 10:40:22 -0400 Subject: [PATCH 4/6] chore: add logging to DRS ingestion --- chord_drs/models.py | 6 +++++- chord_drs/routes.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/chord_drs/models.py b/chord_drs/models.py index 11f66dc..903c248 100644 --- a/chord_drs/models.py +++ b/chord_drs/models.py @@ -69,6 +69,8 @@ class DrsBlob(db.Model, DrsMixin): location = db.Column(db.String(500), nullable=False) def __init__(self, *args, **kwargs): + logger = current_app.logger + # If set, we are deduplicating with an existing file object object_to_copy: DrsBlob | None = kwargs.get("object_to_copy") @@ -104,10 +106,12 @@ def __init__(self, *args, **kwargs): self.size = os.path.getsize(p) self.checksum = drs_file_checksum(location) except Exception as e: - current_app.logger.error(f"Encountered exception during DRS object creation: {e}") + logger.error(f"Encountered exception during DRS object creation: {e}") # TODO: implement more specific exception handling raise Exception("Well if the file is not saved... we can't do squat") + logger.info(f"Creating new DRS object: name={self.name}; size={self.size}; sha256={self.checksum}") + for key_to_remove in ("location", "filename"): if key_to_remove in kwargs: del kwargs[key_to_remove] diff --git a/chord_drs/routes.py b/chord_drs/routes.py index e223c44..a1be118 100644 --- a/chord_drs/routes.py +++ b/chord_drs/routes.py @@ -475,6 +475,7 @@ def object_ingest(): try: filename: str | None = None # no override, use path filename if path is specified instead of a file upload if file is not None: + logger.debug(f"ingest - recieved file object: {file}") file.save(t_obj_path) obj_path = t_obj_path filename = file.filename # still may be none, in which case the temporary filename will be used From dabb039dc3e4e6e3533238b5a4d3e9aac2c5a57a Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 25 Oct 2023 11:15:04 -0400 Subject: [PATCH 5/6] chore: if filename is set when creating duped DrsBlob, override orig. filename --- chord_drs/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chord_drs/models.py b/chord_drs/models.py index 903c248..7f7ac6f 100644 --- a/chord_drs/models.py +++ b/chord_drs/models.py @@ -80,7 +80,7 @@ def __init__(self, *args, **kwargs): self.id = str(uuid4()) if object_to_copy: - self.name = object_to_copy.name + self.name = secure_filename(filename) if filename else object_to_copy.name self.location = object_to_copy.location self.size = object_to_copy.size self.checksum = object_to_copy.checksum From f0ba98cb79079837f24f551cb52e569b5f3b7441 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 25 Oct 2023 11:15:16 -0400 Subject: [PATCH 6/6] test: fix tests --- tests/test_models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index 4ac2100..649d9ba 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -4,10 +4,12 @@ def test_drs_blob_init_bad_file(): + from chord_drs.app import application from chord_drs.models import DrsBlob - with pytest.raises(FileNotFoundError): - DrsBlob(location="path/to/dne") + with application.app_context(): + with pytest.raises(FileNotFoundError): + DrsBlob(location="path/to/dne") def test_drs_blob_init():