From 1a61c12f3a729529c6dc70fa865512bfe75704e5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Mon, 15 Apr 2024 15:44:26 -0400 Subject: [PATCH 01/11] refact: move serialization functions to their own file --- chord_drs/routes.py | 139 +--------------------------------- chord_drs/serialization.py | 149 +++++++++++++++++++++++++++++++++++++ chord_drs/types.py | 5 ++ 3 files changed, 156 insertions(+), 137 deletions(-) create mode 100644 chord_drs/serialization.py diff --git a/chord_drs/routes.py b/chord_drs/routes.py index 6d25a9d..cf9f5a4 100644 --- a/chord_drs/routes.py +++ b/chord_drs/routes.py @@ -13,22 +13,19 @@ Request, current_app, jsonify, - url_for, request, send_file, make_response, ) from sqlalchemy import or_ -from urllib.parse import urlparse from werkzeug.exceptions import BadRequest, Forbidden, NotFound, InternalServerError, RequestedRangeNotSatisfiable from . import __version__ from .authz import authz_middleware from .constants import BENTO_SERVICE_KIND, SERVICE_NAME, SERVICE_TYPE -from .data_sources import DATA_SOURCE_LOCAL, DATA_SOURCE_MINIO from .db import db -from .models import DrsMixin, DrsBlob, DrsBundle -from .types import DRSAccessMethodDict, DRSContentsDict, DRSObjectBentoDict, DRSObjectDict +from .models import DrsBlob, DrsBundle +from .serialization import build_bundle_json, build_blob_json from .utils import drs_file_checksum @@ -119,138 +116,6 @@ def range_not_satisfiable_log_mark(description: str, length: int) -> RequestedRa return RequestedRangeNotSatisfiable(description=description, length=length) -def get_drs_host() -> str: - return urlparse(current_app.config["SERVICE_BASE_URL"]).netloc - - -def create_drs_uri(object_id: str) -> str: - return f"drs://{get_drs_host()}/{object_id}" - - -def build_contents(bundle: DrsBundle, expand: bool) -> list[DRSContentsDict]: - content: list[DRSContentsDict] = [] - bundles = DrsBundle.query.filter_by(parent_bundle=bundle).all() - - for b in bundles: - content.append( - { - **({"contents": build_contents(b, expand)} if expand else {}), - "drs_uri": create_drs_uri(b.id), - "id": b.id, - "name": b.name, # TODO: Can overwrite... see spec - } - ) - - for c in bundle.objects: - content.append( - { - "drs_uri": create_drs_uri(c.id), - "id": c.id, - "name": c.name, # TODO: Can overwrite... see spec - } - ) - - return content - - -def build_bento_object_json(drs_object: DrsMixin) -> DRSObjectBentoDict: - return { - "bento": { - "project_id": drs_object.project_id, - "dataset_id": drs_object.dataset_id, - "data_type": drs_object.data_type, - "public": drs_object.public, - } - } - - -def build_bundle_json( - drs_bundle: DrsBundle, - expand: bool = False, - with_bento_properties: bool = False, -) -> DRSObjectDict: - return { - "contents": build_contents(drs_bundle, expand), - "checksums": [ - { - "checksum": drs_bundle.checksum, - "type": "sha-256", - }, - ], - "created_time": f"{drs_bundle.created.isoformat('T')}Z", - "size": drs_bundle.size, - "name": drs_bundle.name, - # Description should be excluded if null in the database - **({"description": drs_bundle.description} if drs_bundle.description is not None else {}), - "id": drs_bundle.id, - "self_uri": create_drs_uri(drs_bundle.id), - **(build_bento_object_json(drs_bundle) if with_bento_properties else {}), - } - - -def build_blob_json( - drs_blob: DrsBlob, - inside_container: bool = False, - with_bento_properties: bool = False, -) -> DRSObjectDict: - data_source = current_app.config["SERVICE_DATA_SOURCE"] - - blob_url: str = urllib.parse.urljoin( - current_app.config["SERVICE_BASE_URL"] + "/", - url_for("drs_service.object_download", object_id=drs_blob.id).lstrip("/"), - ) - - https_access_method: DRSAccessMethodDict = { - "access_url": { - # url_for external was giving weird results - build the URL by hand instead using the internal url_for - "url": blob_url, - # No headers --> auth will have to be obtained via some - # out-of-band method, or the object's contents are public. This - # will depend on how the service is deployed. - }, - "type": "https", - } - - access_methods: list[DRSAccessMethodDict] = [https_access_method] - - if inside_container and data_source == DATA_SOURCE_LOCAL: - access_methods.append( - { - "access_url": { - "url": f"file://{drs_blob.location}", - }, - "type": "file", - } - ) - elif data_source == DATA_SOURCE_MINIO: - access_methods.append( - { - "access_url": { - "url": drs_blob.location, - }, - "type": "s3", - } - ) - - return { - "access_methods": access_methods, - "checksums": [ - { - "checksum": drs_blob.checksum, - "type": "sha-256", - }, - ], - "created_time": f"{drs_blob.created.isoformat('T')}Z", - "size": drs_blob.size, - "name": drs_blob.name, - # Description should be excluded if null in the database - **({"description": drs_blob.description} if drs_blob.description is not None else {}), - "id": drs_blob.id, - "self_uri": create_drs_uri(drs_blob.id), - **(build_bento_object_json(drs_blob) if with_bento_properties else {}), - } - - @drs_service.route("/service-info", methods=["GET"]) @drs_service.route("/ga4gh/drs/v1/service-info", methods=["GET"]) @authz_middleware.deco_public_endpoint diff --git a/chord_drs/serialization.py b/chord_drs/serialization.py new file mode 100644 index 0000000..5bbac99 --- /dev/null +++ b/chord_drs/serialization.py @@ -0,0 +1,149 @@ +import urllib.parse + +from flask import ( + current_app, + url_for, +) +from urllib.parse import urlparse + +from .data_sources import DATA_SOURCE_LOCAL, DATA_SOURCE_MINIO +from .models import DrsMixin, DrsBlob, DrsBundle +from .types import DRSAccessMethodDict, DRSContentsDict, DRSObjectBentoMetaDict, DRSObjectDict + + +__all__ = [ + "build_bundle_json", + "build_blob_json", +] + + +def get_drs_host() -> str: + return urlparse(current_app.config["SERVICE_BASE_URL"]).netloc + + +def create_drs_uri(object_id: str) -> str: + return f"drs://{get_drs_host()}/{object_id}" + + +def build_contents(bundle: DrsBundle, expand: bool) -> list[DRSContentsDict]: + content: list[DRSContentsDict] = [] + bundles = DrsBundle.query.filter_by(parent_bundle=bundle).all() + + for b in bundles: + content.append( + { + **({"contents": build_contents(b, expand)} if expand else {}), + "drs_uri": create_drs_uri(b.id), + "id": b.id, + "name": b.name, # TODO: Can overwrite... see spec + } + ) + + for c in bundle.objects: + content.append( + { + "drs_uri": create_drs_uri(c.id), + "id": c.id, + "name": c.name, # TODO: Can overwrite... see spec + } + ) + + return content + + +def build_bento_object_json(drs_object: DrsMixin) -> DRSObjectBentoMetaDict: + return { + "bento": { + "project_id": drs_object.project_id, + "dataset_id": drs_object.dataset_id, + "data_type": drs_object.data_type, + "public": drs_object.public, + } + } + + +def build_bundle_json( + drs_bundle: DrsBundle, + expand: bool = False, + with_bento_properties: bool = False, +) -> DRSObjectDict: + return { + "contents": build_contents(drs_bundle, expand), + "checksums": [ + { + "checksum": drs_bundle.checksum, + "type": "sha-256", + }, + ], + "created_time": f"{drs_bundle.created.isoformat('T')}Z", + "size": drs_bundle.size, + "name": drs_bundle.name, + # Description should be excluded if null in the database + **({"description": drs_bundle.description} if drs_bundle.description is not None else {}), + "id": drs_bundle.id, + "self_uri": create_drs_uri(drs_bundle.id), + **(build_bento_object_json(drs_bundle) if with_bento_properties else {}), + } + + +def build_blob_json( + drs_blob: DrsBlob, + inside_container: bool = False, + with_bento_properties: bool = False, +) -> DRSObjectDict: + data_source = current_app.config["SERVICE_DATA_SOURCE"] + + blob_url: str = urllib.parse.urljoin( + current_app.config["SERVICE_BASE_URL"] + "/", + url_for("drs_service.object_download", object_id=drs_blob.id).lstrip("/"), + ) + + https_access_method: DRSAccessMethodDict = { + "access_url": { + # url_for external was giving weird results - build the URL by hand instead using the internal url_for + "url": blob_url, + # No headers --> auth will have to be obtained via some + # out-of-band method, or the object's contents are public. This + # will depend on how the service is deployed. + }, + "type": "https", + } + + access_methods: list[DRSAccessMethodDict] = [https_access_method] + + if inside_container and data_source == DATA_SOURCE_LOCAL: + access_methods.append( + { + "access_url": { + "url": f"file://{drs_blob.location}", + }, + "type": "file", + } + ) + elif data_source == DATA_SOURCE_MINIO: + access_methods.append( + { + "access_url": { + "url": drs_blob.location, + }, + "type": "s3", + } + ) + + return { + "access_methods": access_methods, + "checksums": [ + { + "checksum": drs_blob.checksum, + "type": "sha-256", + }, + ], + "created_time": f"{drs_blob.created.isoformat('T')}Z", + "size": drs_blob.size, + "name": drs_blob.name, + # Description should be excluded if null in the database + **({"description": drs_blob.description} if drs_blob.description is not None else {}), + "id": drs_blob.id, + "self_uri": create_drs_uri(drs_blob.id), + **(build_bento_object_json(drs_blob) if with_bento_properties else {}), + } diff --git a/chord_drs/types.py b/chord_drs/types.py index 391e99b..46eb25b 100644 --- a/chord_drs/types.py +++ b/chord_drs/types.py @@ -6,6 +6,7 @@ "DRSChecksumDict", "DRSContentsDict", "DRSObjectBentoDict", + "DRSObjectBentoMetaDict", "DRSObjectDict", ] @@ -60,6 +61,10 @@ class DRSObjectBentoDict(TypedDict): public: bool +class DRSObjectBentoMetaDict(TypedDict): + bento: DRSObjectBentoDict + + class DRSObjectDict(_DRSObjectDictBase, total=False): access_methods: list[DRSAccessMethodDict] name: str From 9087f47c95f25b678bee2137ed816c0597c893e2 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Mon, 15 Apr 2024 16:20:30 -0400 Subject: [PATCH 02/11] feat: add object delete functionality --- chord_drs/backends/base.py | 7 +++++++ chord_drs/backends/local.py | 7 +++++++ chord_drs/backends/minio.py | 4 ++++ chord_drs/routes.py | 38 +++++++++++++++++++++++++++++++------ 4 files changed, 50 insertions(+), 6 deletions(-) diff --git a/chord_drs/backends/base.py b/chord_drs/backends/base.py index a05f4a1..31ee67b 100644 --- a/chord_drs/backends/base.py +++ b/chord_drs/backends/base.py @@ -9,6 +9,10 @@ class Backend(ABC): def save(self, current_location: str, filename: str) -> str: # pragma: no cover pass + @abstractmethod + def delete(self, location: str) -> None: # pragma: no cover + pass + class FakeBackend(Backend): """ @@ -17,3 +21,6 @@ class FakeBackend(Backend): def save(self, current_location: str, filename: str) -> str: return current_location + + def delete(self, location: str): + return None diff --git a/chord_drs/backends/local.py b/chord_drs/backends/local.py index 2209478..9dabbd8 100644 --- a/chord_drs/backends/local.py +++ b/chord_drs/backends/local.py @@ -24,3 +24,10 @@ def save(self, current_location: str | Path, filename: str) -> str: new_location = self.base_location / filename copy(current_location, new_location) return str(new_location.resolve()) + + def delete(self, location: str | Path) -> None: + loc = location if isinstance(location, Path) else Path(location) + if self.base_location in loc.parents: + loc.unlink() + return + raise ValueError(f"Location {loc} is not a subpath of backend base location {self.base_location}") diff --git a/chord_drs/backends/minio.py b/chord_drs/backends/minio.py index 1c05788..4f72dce 100644 --- a/chord_drs/backends/minio.py +++ b/chord_drs/backends/minio.py @@ -33,3 +33,7 @@ def save(self, current_location: str, filename: str) -> str: with open(current_location, "rb") as f: obj = self.bucket.put_object(Key=filename, Body=f) return MinioBackend.build_minio_location(obj) + + def delete(self, location: str) -> None: + obj = self.get_minio_object(location) + obj.delete() diff --git a/chord_drs/routes.py b/chord_drs/routes.py index cf9f5a4..b1dac43 100644 --- a/chord_drs/routes.py +++ b/chord_drs/routes.py @@ -4,7 +4,7 @@ import urllib.parse from asgiref.sync import async_to_sync -from bento_lib.auth.permissions import Permission, P_INGEST_DATA, P_QUERY_DATA, P_DOWNLOAD_DATA +from bento_lib.auth.permissions import Permission, P_INGEST_DATA, P_QUERY_DATA, P_DELETE_DATA, P_DOWNLOAD_DATA from bento_lib.auth.resources import RESOURCE_EVERYTHING, build_resource from bento_lib.service_info.constants import SERVICE_ORGANIZATION_C3G from bento_lib.service_info.helpers import build_service_info @@ -22,6 +22,7 @@ from . import __version__ from .authz import authz_middleware +from .backend import get_backend from .constants import BENTO_SERVICE_KIND, SERVICE_NAME, SERVICE_TYPE from .db import db from .models import DrsBlob, DrsBundle @@ -81,18 +82,18 @@ def _post_headers_getter(r: Request) -> dict[str, str]: def fetch_and_check_object_permissions(object_id: str, permission: Permission) -> tuple[DrsBlob | DrsBundle, bool]: - view_data_everything = check_everything_permission(permission) + has_permission_on_everything = check_everything_permission(permission) drs_object, is_bundle = get_drs_object(object_id) if not drs_object: authz_middleware.mark_authz_done(request) - if authz_enabled() and not view_data_everything: # Don't leak if this object exists + if authz_enabled() and not has_permission_on_everything: # Don't leak if this object exists raise forbidden() raise NotFound("No object found for this ID") # Check permissions ------------------------------------------------- - if view_data_everything: + if has_permission_on_everything: # Good to go already! authz_middleware.mark_authz_done(request) else: @@ -153,9 +154,34 @@ def get_drs_object(object_id: str) -> tuple[DrsBlob | DrsBundle | None, bool]: return None, False -@drs_service.route("/objects/", methods=["GET"]) -@drs_service.route("/ga4gh/drs/v1/objects/", methods=["GET"]) +@drs_service.route("/objects/", methods=["GET", "DELETE"]) +@drs_service.route("/ga4gh/drs/v1/objects/", methods=["GET", "DELETE"]) def object_info(object_id: str): + logger = current_app.logger + + if request.method == "DELETE": + drs_object, is_bundle = fetch_and_check_object_permissions(object_id, P_DELETE_DATA) + + logger.info(f"Deleting object {drs_object.id}") + + if not is_bundle: + q = DrsBundle.query.filter_by(location=drs_object.location).count() + n_using_file = q.count() + if n_using_file == 1 and q.first().id == drs_object.id: + # If this object is the only one using the file, delete the file too + logger.info( + f"Deleting file at {drs_object.location}, since {drs_object.id} is the only object referring to it." + ) + backend = get_backend() + backend.delete(drs_object.location) + + # Don't bother with additional bundle deleting logic, they'll be removed soon anyway. TODO + + drs_object.delete() + db.session.commit() + + return current_app.response_class(status=204) + drs_object, is_bundle = fetch_and_check_object_permissions(object_id, P_QUERY_DATA) # The requester can ask for additional, non-spec-compliant Bento properties to be included in the response From 0e3eebcb4060d7ca8f13af1fede513a64003f6af Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 16 Apr 2024 11:07:27 -0400 Subject: [PATCH 03/11] test: bad backend when getting minio dict --- tests/test_models.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/test_models.py b/tests/test_models.py index 649d9ba..8d14842 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -12,7 +12,7 @@ def test_drs_blob_init_bad_file(): DrsBlob(location="path/to/dne") -def test_drs_blob_init(): +def test_drs_blob_init_bad_backend(): from chord_drs.app import application from chord_drs.models import DrsBlob @@ -27,3 +27,14 @@ def test_drs_blob_init(): def test_minio_method_wrong_backend(client_local, drs_object): assert drs_object.return_minio_object() is None + + +def test_minio_method_wrong_backend_2(client_minio, drs_object_minio): + from flask import g + from chord_drs.app import application + + application.config["SERVICE_DATA_SOURCE"] = "local" + with pytest.raises(Exception) as e: + g.backend = None # force a backend re-init with local source, mismatching with DRS object + drs_object_minio.return_minio_object() + assert "not properly configured" in str(e) From a1753586d4153a5c462cee3dea5a958cec0cd430 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 16 Apr 2024 11:07:45 -0400 Subject: [PATCH 04/11] fix: issues when deleting object --- chord_drs/backends/minio.py | 6 ++++-- chord_drs/models.py | 4 ++-- chord_drs/routes.py | 42 +++++++++++++++++++------------------ 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/chord_drs/backends/minio.py b/chord_drs/backends/minio.py index 4f72dce..0875519 100644 --- a/chord_drs/backends/minio.py +++ b/chord_drs/backends/minio.py @@ -26,8 +26,10 @@ def build_minio_location(obj): return f"s3://{host}/{obj.bucket_name}/{obj.key}" def get_minio_object(self, location: str): - obj = self.bucket.Object(location.split("/")[-1]) - return obj.get() + return self.bucket.Object(location.split("/")[-1]) + + def get_minio_object_dict(self, location: str) -> dict: + return self.get_minio_object(location).get() def save(self, current_location: str, filename: str) -> str: with open(current_location, "rb") as f: diff --git a/chord_drs/models.py b/chord_drs/models.py index 9407290..b33a09b 100644 --- a/chord_drs/models.py +++ b/chord_drs/models.py @@ -128,7 +128,7 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def return_minio_object(self): + def return_minio_object(self) -> dict: parsed_url = urlparse(self.location) if parsed_url.scheme != "s3": @@ -139,4 +139,4 @@ def return_minio_object(self): if not backend or not isinstance(backend, MinioBackend): raise Exception("The backend for this instance is not properly configured.") - return backend.get_minio_object(self.location) + return backend.get_minio_object_dict(self.location) diff --git a/chord_drs/routes.py b/chord_drs/routes.py index b1dac43..8b0e6d7 100644 --- a/chord_drs/routes.py +++ b/chord_drs/routes.py @@ -1,3 +1,4 @@ +import logging import os import re import tempfile @@ -154,32 +155,33 @@ def get_drs_object(object_id: str) -> tuple[DrsBlob | DrsBundle | None, bool]: return None, False -@drs_service.route("/objects/", methods=["GET", "DELETE"]) -@drs_service.route("/ga4gh/drs/v1/objects/", methods=["GET", "DELETE"]) -def object_info(object_id: str): - logger = current_app.logger +def delete_drs_object(object_id: str, logger: logging.Logger): + drs_object, is_bundle = fetch_and_check_object_permissions(object_id, P_DELETE_DATA) - if request.method == "DELETE": - drs_object, is_bundle = fetch_and_check_object_permissions(object_id, P_DELETE_DATA) + logger.info(f"Deleting object {drs_object.id}") - logger.info(f"Deleting object {drs_object.id}") + if not is_bundle: + q = DrsBlob.query.filter_by(location=drs_object.location) + n_using_file = q.count() + if n_using_file == 1 and q.first().id == drs_object.id: + # If this object is the only one using the file, delete the file too + logger.info( + f"Deleting file at {drs_object.location}, since {drs_object.id} is the only object referring to it." + ) + backend = get_backend() + backend.delete(drs_object.location) - if not is_bundle: - q = DrsBundle.query.filter_by(location=drs_object.location).count() - n_using_file = q.count() - if n_using_file == 1 and q.first().id == drs_object.id: - # If this object is the only one using the file, delete the file too - logger.info( - f"Deleting file at {drs_object.location}, since {drs_object.id} is the only object referring to it." - ) - backend = get_backend() - backend.delete(drs_object.location) + # Don't bother with additional bundle deleting logic, they'll be removed soon anyway. TODO - # Don't bother with additional bundle deleting logic, they'll be removed soon anyway. TODO + db.session.delete(drs_object) + db.session.commit() - drs_object.delete() - db.session.commit() +@drs_service.route("/objects/", methods=["GET", "DELETE"]) +@drs_service.route("/ga4gh/drs/v1/objects/", methods=["GET", "DELETE"]) +def object_info(object_id: str): + if request.method == "DELETE": + delete_drs_object(object_id, current_app.logger) return current_app.response_class(status=204) drs_object, is_bundle = fetch_and_check_object_permissions(object_id, P_QUERY_DATA) From bc06ea517b0246e2079ce28ac33ff794ba23ab7b Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 16 Apr 2024 11:07:54 -0400 Subject: [PATCH 05/11] test: basic object delete test --- tests/test_routes.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/test_routes.py b/tests/test_routes.py index a05c233..6328f33 100644 --- a/tests/test_routes.py +++ b/tests/test_routes.py @@ -1,7 +1,10 @@ import bento_lib import json +import os.path import pytest import responses +import tempfile +import uuid from flask import current_app from jsonschema import validate @@ -92,16 +95,24 @@ def authz_drs_specific_obj(iters=1): @responses.activate def test_object_fail(client): authz_everything_true() + res = client.get(f"/objects/{NON_EXISTENT_ID}") assert res.status_code == 404 + res = client.delete(f"/objects/{NON_EXISTENT_ID}") + assert res.status_code == 404 + @responses.activate def test_object_fail_forbidden(client): authz_everything_false() + res = client.get(f"/objects/{NON_EXISTENT_ID}") # can't know if this exists since we don't have access assert res.status_code == 403 + res = client.delete(f"/objects/{NON_EXISTENT_ID}") + assert res.status_code == 403 + @responses.activate def test_object_download_fail(client): @@ -250,6 +261,29 @@ def test_object_with_disabled_internal_path(client, drs_object): validate_object_fields(data, with_internal_path=False) +@responses.activate +def test_object_delete(client): + authz_everything_true() + + contents = str(uuid.uuid4()) + + # first, ingest a new object for us to test deleting with + with tempfile.NamedTemporaryFile(mode="w") as tf: + tf.write(contents) # random content, so checksum is unique + tf.flush() + res = client.post("/ingest", data={"path": tf.name}) + + ingested_obj = res.get_json() + + res = client.delete(f"/objects/{ingested_obj['id']}") + assert res.status_code == 204 + + # deleted, so if we try again it should be a 404 + + res = client.delete(f"/objects/{ingested_obj['id']}") + assert res.status_code == 404 + + @responses.activate def test_bundle_and_download(client, drs_bundle): authz_everything_true() From 18ec9e939a1d668946f733ecdfe9055d53ea6e10 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 16 Apr 2024 13:05:40 -0400 Subject: [PATCH 06/11] test: object deletion; use real local backend for tests --- chord_drs/backends/base.py | 14 +--------- tests/conftest.py | 19 +++++++++----- tests/test_routes.py | 52 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 20 deletions(-) diff --git a/chord_drs/backends/base.py b/chord_drs/backends/base.py index 31ee67b..61a2af4 100644 --- a/chord_drs/backends/base.py +++ b/chord_drs/backends/base.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod -__all__ = ["Backend", "FakeBackend"] +__all__ = ["Backend"] class Backend(ABC): @@ -12,15 +12,3 @@ def save(self, current_location: str, filename: str) -> str: # pragma: no cover @abstractmethod def delete(self, location: str) -> None: # pragma: no cover pass - - -class FakeBackend(Backend): - """ - For the tests - """ - - def save(self, current_location: str, filename: str) -> str: - return current_location - - def delete(self, location: str): - return None diff --git a/tests/conftest.py b/tests/conftest.py index c53f60c..3b95de6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,13 +2,14 @@ import os import pathlib import pytest +import shutil from flask import g +from flask.testing import FlaskClient from moto import mock_s3 from pytest_lazyfixture import lazy_fixture # Must only be imports that don't import authz/app/config/db -from chord_drs.backends.base import FakeBackend from chord_drs.backends.minio import MinioBackend from chord_drs.data_sources import DATA_SOURCE_LOCAL, DATA_SOURCE_MINIO @@ -47,7 +48,7 @@ def empty_file_path(): # Function rather than constant so we can set environ fi @pytest.fixture -def client_minio(): +def client_minio() -> FlaskClient: os.environ["BENTO_AUTHZ_SERVICE_URL"] = AUTHZ_URL from chord_drs.app import application, db @@ -72,9 +73,12 @@ def client_minio(): @pytest.fixture -def client_local(): +def client_local() -> FlaskClient: + local_test_volume = (pathlib.Path(__file__).parent / "data").absolute() + local_test_volume.mkdir(parents=True, exist_ok=True) + os.environ["BENTO_AUTHZ_SERVICE_URL"] = AUTHZ_URL - os.environ["DATA"] = str((pathlib.Path(__file__).parent / "data").absolute()) + os.environ["DATA"] = str(local_test_volume) from chord_drs.app import application, db @@ -82,8 +86,6 @@ def client_local(): application.config["SERVICE_DATA_SOURCE"] = DATA_SOURCE_LOCAL with application.app_context(): - g.backend = FakeBackend() - db.create_all() yield application.test_client() @@ -91,9 +93,12 @@ def client_local(): db.session.remove() db.drop_all() + # clear test volume + shutil.rmtree(local_test_volume) + @pytest.fixture(params=[lazy_fixture("client_minio"), lazy_fixture("client_local")]) -def client(request): +def client(request) -> FlaskClient: return request.param diff --git a/tests/test_routes.py b/tests/test_routes.py index 6328f33..c999c9f 100644 --- a/tests/test_routes.py +++ b/tests/test_routes.py @@ -284,6 +284,58 @@ def test_object_delete(client): assert res.status_code == 404 +@responses.activate +def test_object_multi_delete(client): + from chord_drs.models import DrsBlob + + authz_everything_true() + + contents = str(uuid.uuid4()) + + # first, ingest two new objects with the same contents + with tempfile.NamedTemporaryFile(mode="w") as tf: + tf.write(contents) # random content, so checksum is unique + tf.flush() + + # two different projects to ensure we have two objects pointing to the same resource: + res1 = client.post("/ingest", data={"path": tf.name, "project_id": "project1"}) + assert res1.status_code == 201 + res2 = client.post("/ingest", data={"path": tf.name, "project_id": "project2"}) + assert res2.status_code == 201 + + i1 = res1.get_json() + i2 = res2.get_json() + + assert i1["id"] != i2["id"] + + b1 = DrsBlob.query.filter_by(id=i1["id"]).first() + b2 = DrsBlob.query.filter_by(id=i2["id"]).first() + + assert b1.location == b2.location + + # make sure we can get the bytes of i2 + assert client.get(f"/objects/{i2['id']}/download").status_code == 200 + + # delete i2 + rd2 = client.delete(f"/objects/{i2['id']}") + assert rd2.status_code == 204 + + # make sure we can still get the bytes of i1 + assert client.get(f"/objects/{i1['id']}/download").status_code == 200 + + # check file exists if local + if b1.location.startswith("/"): + assert os.path.exists(b1.location) + + # delete i1 + rd1 = client.delete(f"/objects/{i1['id']}") + assert rd1.status_code == 204 + + # check file doesn't exist if local + if b1.location.startswith("/"): + assert not os.path.exists(b1.location) + + @responses.activate def test_bundle_and_download(client, drs_bundle): authz_everything_true() From f0f1c89edae5a7d5022f5c2dc6764ae4fa6bdc00 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 16 Apr 2024 13:14:17 -0400 Subject: [PATCH 07/11] chore: bump version to 0.17.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 19e9a07..85cefb2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "chord-drs" -version = "0.16.1" +version = "0.17.0" description = "An implementation of a data repository system (as per GA4GH's specs) for the Bento platform." authors = ["David Lougheed "] license = "LGPL-3.0" From 1587a495362b71f09eca6921fe730b58d793fc16 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 16 Apr 2024 15:30:44 -0400 Subject: [PATCH 08/11] refact: pass config as injected dict to backend init --- chord_drs/backend.py | 2 +- chord_drs/backends/base.py | 5 +++++ chord_drs/backends/local.py | 7 +++---- chord_drs/backends/minio.py | 20 ++++++++++---------- tests/conftest.py | 2 +- 5 files changed, 20 insertions(+), 16 deletions(-) diff --git a/chord_drs/backend.py b/chord_drs/backend.py index 809fcca..b268ef7 100644 --- a/chord_drs/backend.py +++ b/chord_drs/backend.py @@ -13,7 +13,7 @@ def _get_backend() -> Backend | None: # Instantiate backend if needed backend_class = DATA_SOURCE_BACKENDS.get(current_app.config["SERVICE_DATA_SOURCE"]) - return backend_class() if backend_class else None + return backend_class(current_app.config) if backend_class else None def get_backend() -> Backend | None: diff --git a/chord_drs/backends/base.py b/chord_drs/backends/base.py index 61a2af4..58b7222 100644 --- a/chord_drs/backends/base.py +++ b/chord_drs/backends/base.py @@ -4,7 +4,12 @@ __all__ = ["Backend"] +# noinspection PyUnusedLocal class Backend(ABC): + @abstractmethod + def __init__(self, config: dict): # pragma: no cover + pass + @abstractmethod def save(self, current_location: str, filename: str) -> str: # pragma: no cover pass diff --git a/chord_drs/backends/local.py b/chord_drs/backends/local.py index 9dabbd8..c57f6e8 100644 --- a/chord_drs/backends/local.py +++ b/chord_drs/backends/local.py @@ -1,5 +1,4 @@ from shutil import copy -from flask import current_app from pathlib import Path from .base import Backend @@ -15,10 +14,10 @@ class LocalBackend(Backend): specified by the DATA var env, the default being in ~/chord_drs_data """ - def __init__(self): - self.base_location = Path(current_app.config["SERVICE_DATA"]) + def __init__(self, config: dict): # config is dict or flask.Config, which is a subclass of dict. + self.base_location = Path(config["SERVICE_DATA"]) # We can use mkdir, since resolve has been called in config.py - self.base_location.mkdir(exist_ok=True) + self.base_location.mkdir(parents=True, exist_ok=True) def save(self, current_location: str | Path, filename: str) -> str: new_location = self.base_location / filename diff --git a/chord_drs/backends/minio.py b/chord_drs/backends/minio.py index 0875519..2ba1c11 100644 --- a/chord_drs/backends/minio.py +++ b/chord_drs/backends/minio.py @@ -1,6 +1,5 @@ import boto3 -from flask import current_app from urllib.parse import urlparse from .base import Backend @@ -10,19 +9,20 @@ class MinioBackend(Backend): - def __init__(self, resource=None): + def __init__(self, config: dict, resource=None): # config is dict or flask.Config, which is a subclass of dict. + self._minio_url = config["MINIO_URL"] + self.minio = resource or boto3.resource( "s3", - endpoint_url=current_app.config["MINIO_URL"], - aws_access_key_id=current_app.config["MINIO_USERNAME"], - aws_secret_access_key=current_app.config["MINIO_PASSWORD"], + endpoint_url=self._minio_url, + aws_access_key_id=config["MINIO_USERNAME"], + aws_secret_access_key=config["MINIO_PASSWORD"], ) - self.bucket = self.minio.Bucket(current_app.config["MINIO_BUCKET"]) + self.bucket = self.minio.Bucket(config["MINIO_BUCKET"]) - @staticmethod - def build_minio_location(obj): - host = urlparse(current_app.config["MINIO_URL"]).netloc + def build_minio_location(self, obj): + host = urlparse(self._minio_url).netloc return f"s3://{host}/{obj.bucket_name}/{obj.key}" def get_minio_object(self, location: str): @@ -34,7 +34,7 @@ def get_minio_object_dict(self, location: str) -> dict: def save(self, current_location: str, filename: str) -> str: with open(current_location, "rb") as f: obj = self.bucket.put_object(Key=filename, Body=f) - return MinioBackend.build_minio_location(obj) + return self.build_minio_location(obj) def delete(self, location: str) -> None: obj = self.get_minio_object(location) diff --git a/tests/conftest.py b/tests/conftest.py index 3b95de6..a921ec2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -60,7 +60,7 @@ def client_minio() -> FlaskClient: with application.app_context(), mock_s3(): s3 = boto3.resource("s3") - minio_backend = MinioBackend(resource=s3) + minio_backend = MinioBackend(application.config, resource=s3) g.backend = minio_backend s3.create_bucket(Bucket=bucket_name) From 7d4a1d2cb942c3ac021a5799c93c886b0adf92c3 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 16 Apr 2024 15:30:54 -0400 Subject: [PATCH 09/11] chore: note possible delete race condition --- chord_drs/routes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/chord_drs/routes.py b/chord_drs/routes.py index 8b0e6d7..167e6de 100644 --- a/chord_drs/routes.py +++ b/chord_drs/routes.py @@ -165,6 +165,7 @@ def delete_drs_object(object_id: str, logger: logging.Logger): n_using_file = q.count() if n_using_file == 1 and q.first().id == drs_object.id: # If this object is the only one using the file, delete the file too + # TODO: this can create a race condition and leave files undeleted... should we have a cleanup on start? logger.info( f"Deleting file at {drs_object.location}, since {drs_object.id} is the only object referring to it." ) From 41b6985c8135d3a377d0c372d0aaabecdd83d3a6 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 16 Apr 2024 15:31:25 -0400 Subject: [PATCH 10/11] test(backends): exception when deleting outside localbackend dir --- tests/conftest.py | 15 ++++++++++----- tests/test_backends.py | 24 ++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 5 deletions(-) create mode 100644 tests/test_backends.py diff --git a/tests/conftest.py b/tests/conftest.py index a921ec2..26a0595 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -73,12 +73,20 @@ def client_minio() -> FlaskClient: @pytest.fixture -def client_local() -> FlaskClient: +def local_volume(): local_test_volume = (pathlib.Path(__file__).parent / "data").absolute() local_test_volume.mkdir(parents=True, exist_ok=True) + yield local_test_volume + + # clear test volume + shutil.rmtree(local_test_volume) + + +@pytest.fixture +def client_local(local_volume: pathlib.Path) -> FlaskClient: os.environ["BENTO_AUTHZ_SERVICE_URL"] = AUTHZ_URL - os.environ["DATA"] = str(local_test_volume) + os.environ["DATA"] = str(local_volume) from chord_drs.app import application, db @@ -93,9 +101,6 @@ def client_local() -> FlaskClient: db.session.remove() db.drop_all() - # clear test volume - shutil.rmtree(local_test_volume) - @pytest.fixture(params=[lazy_fixture("client_minio"), lazy_fixture("client_local")]) def client(request) -> FlaskClient: diff --git a/tests/test_backends.py b/tests/test_backends.py new file mode 100644 index 0000000..59b8d4b --- /dev/null +++ b/tests/test_backends.py @@ -0,0 +1,24 @@ +import pathlib +import pytest + +from chord_drs.backends.local import LocalBackend + + +def test_local_backend(local_volume): + backend = LocalBackend({"SERVICE_DATA": str(local_volume)}) + + file_to_ingest = pathlib.Path(__file__).parent / "dummy_file.txt" + + backend.save(file_to_ingest, "dummy_file.txt") + assert (local_volume / "dummy_file.txt").exists() + + backend.delete(local_volume / "dummy_file.txt") + assert not (local_volume / "dummy_file.txt").exists() + + +def test_local_backend_raises(local_volume): + backend = LocalBackend({"SERVICE_DATA": str(local_volume)}) + + with pytest.raises(ValueError): + # before we can even figure out file does not exist, this is not a local volume subpath: + backend.delete("/tmp/does_not_exist.txt") From 082cb14a25d6b0f9802f5f5c47fe610affcd9a4a Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 17 Apr 2024 12:50:22 -0400 Subject: [PATCH 11/11] refact: remove bentio meta dict type --- chord_drs/serialization.py | 18 ++++++++---------- chord_drs/types.py | 5 ----- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/chord_drs/serialization.py b/chord_drs/serialization.py index 5bbac99..d1ce098 100644 --- a/chord_drs/serialization.py +++ b/chord_drs/serialization.py @@ -8,7 +8,7 @@ from .data_sources import DATA_SOURCE_LOCAL, DATA_SOURCE_MINIO from .models import DrsMixin, DrsBlob, DrsBundle -from .types import DRSAccessMethodDict, DRSContentsDict, DRSObjectBentoMetaDict, DRSObjectDict +from .types import DRSAccessMethodDict, DRSContentsDict, DRSObjectBentoDict, DRSObjectDict __all__ = [ @@ -51,14 +51,12 @@ def build_contents(bundle: DrsBundle, expand: bool) -> list[DRSContentsDict]: return content -def build_bento_object_json(drs_object: DrsMixin) -> DRSObjectBentoMetaDict: +def build_bento_object_json(drs_object: DrsMixin) -> DRSObjectBentoDict: return { - "bento": { - "project_id": drs_object.project_id, - "dataset_id": drs_object.dataset_id, - "data_type": drs_object.data_type, - "public": drs_object.public, - } + "project_id": drs_object.project_id, + "dataset_id": drs_object.dataset_id, + "data_type": drs_object.data_type, + "public": drs_object.public, } @@ -82,7 +80,7 @@ def build_bundle_json( **({"description": drs_bundle.description} if drs_bundle.description is not None else {}), "id": drs_bundle.id, "self_uri": create_drs_uri(drs_bundle.id), - **(build_bento_object_json(drs_bundle) if with_bento_properties else {}), + **({"bento": build_bento_object_json(drs_bundle)} if with_bento_properties else {}), } @@ -145,5 +143,5 @@ def build_blob_json( **({"description": drs_blob.description} if drs_blob.description is not None else {}), "id": drs_blob.id, "self_uri": create_drs_uri(drs_blob.id), - **(build_bento_object_json(drs_blob) if with_bento_properties else {}), + **({"bento": build_bento_object_json(drs_blob)} if with_bento_properties else {}), } diff --git a/chord_drs/types.py b/chord_drs/types.py index 46eb25b..391e99b 100644 --- a/chord_drs/types.py +++ b/chord_drs/types.py @@ -6,7 +6,6 @@ "DRSChecksumDict", "DRSContentsDict", "DRSObjectBentoDict", - "DRSObjectBentoMetaDict", "DRSObjectDict", ] @@ -61,10 +60,6 @@ class DRSObjectBentoDict(TypedDict): public: bool -class DRSObjectBentoMetaDict(TypedDict): - bento: DRSObjectBentoDict - - class DRSObjectDict(_DRSObjectDictBase, total=False): access_methods: list[DRSAccessMethodDict] name: str