diff --git a/README.md b/README.md index 3838836..dceb64b 100644 --- a/README.md +++ b/README.md @@ -15,8 +15,7 @@ or inside a MinIO instance (which is a s3-like software). ## TODO / Future considerations - Ingesting is either through the command line or by the endpoint of the same name - (which will create a single object). There is currently no way to ingest an archive - or to package objects into bundles. + (which will create a single object). - Consider how to be aware of http vs https depending on the deployment setup (in singularity, docker, as is). diff --git a/chord_drs/commands.py b/chord_drs/commands.py index f95565e..adfc117 100644 --- a/chord_drs/commands.py +++ b/chord_drs/commands.py @@ -7,46 +7,11 @@ from flask.cli import with_appcontext from .db import db -from .models import DrsBlob, DrsBundle - - -def create_drs_bundle( - location: str, - parent: DrsBundle | None = None, - project_id: str | None = None, - dataset_id: str | None = None, - data_type: str | None = None, - exclude: frozenset[str] = frozenset({}), -) -> DrsBundle: - perms_kwargs = {"project_id": project_id, "dataset_id": dataset_id, "data_type": data_type} - - bundle = DrsBundle(name=os.path.basename(location), **perms_kwargs) - - if parent: - bundle.parent_bundle = parent - - for f in os.listdir(location): - if exclude and f in exclude: - continue - - f = os.path.abspath(os.path.join(location, f)) - - if os.path.isfile(f): - create_drs_blob(f, parent=bundle, **perms_kwargs) - else: - create_drs_bundle(f, parent=bundle, **perms_kwargs) - - bundle.update_checksum_and_size() - db.session.add(bundle) - - current_app.logger.info(f"Created a new bundle, name: {bundle.name}, ID : {bundle.id}, size: {bundle.size}") - - return bundle +from .models import DrsBlob def create_drs_blob( location: str, - parent: DrsBundle | None = None, project_id: str | None = None, dataset_id: str | None = None, data_type: str | None = None, @@ -58,9 +23,6 @@ def create_drs_blob( data_type=data_type, ) - if parent: - drs_blob.bundle = parent - db.session.add(drs_blob) current_app.logger.info(f"Created a new blob, filename: {drs_blob.location} ID : {drs_blob.id}") @@ -82,18 +44,16 @@ def ingest(source: str, project: str, dataset: str, data_type: str) -> None: current_app.logger.setLevel(logging.INFO) # TODO: ingestion for remote files or archives - # TODO: Create directories in minio when ingesting a bundle if not os.path.exists(source): - raise ClickException("File or directory provided does not exist") + raise ClickException("Path provided does not exist") source = os.path.abspath(source) perms_kwargs = {"project_id": project or None, "dataset_id": dataset or None, "data_type": data_type or None} - if os.path.isfile(source): - create_drs_blob(source, **perms_kwargs) - else: - create_drs_bundle(source, **perms_kwargs) + if not os.path.isfile(source): + raise ClickException("Directories cannot be ingested") + create_drs_blob(source, **perms_kwargs) db.session.commit() diff --git a/chord_drs/migrations/versions/5e982af5cde4_remove_bundles.py b/chord_drs/migrations/versions/5e982af5cde4_remove_bundles.py new file mode 100644 index 0000000..6639388 --- /dev/null +++ b/chord_drs/migrations/versions/5e982af5cde4_remove_bundles.py @@ -0,0 +1,59 @@ +"""remove bundles + +Revision ID: 5e982af5cde4 +Revises: dcd501398d46 +Create Date: 2024-07-17 19:54:03.564099 + +""" + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "5e982af5cde4" +down_revision = "dcd501398d46" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + + with op.batch_alter_table("drs_object", schema=None) as batch_op: + batch_op.drop_constraint(None, type_="foreignkey") + batch_op.drop_column("bundle_id") + + op.drop_table("drs_bundle") + + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + + op.create_table( + "drs_bundle", + sa.Column("id", sa.VARCHAR(), nullable=False), + sa.Column("created", sa.DATETIME(), server_default=sa.text("(CURRENT_TIMESTAMP)"), nullable=True), + sa.Column("checksum", sa.VARCHAR(length=64), nullable=False), + sa.Column("size", sa.INTEGER(), nullable=True), + sa.Column("name", sa.VARCHAR(length=250), nullable=True), + sa.Column("description", sa.VARCHAR(length=1000), nullable=True), + sa.Column("parent_bundle_id", sa.INTEGER(), nullable=True), + sa.Column("project_id", sa.VARCHAR(length=64), nullable=True), + sa.Column("dataset_id", sa.VARCHAR(length=64), nullable=True), + sa.Column("data_type", sa.VARCHAR(length=24), nullable=True), + sa.Column("public", sa.BOOLEAN(), server_default=sa.text("0"), nullable=False), + sa.ForeignKeyConstraint( + ["parent_bundle_id"], + ["drs_bundle.id"], + ), + sa.PrimaryKeyConstraint("id"), + ) + + with op.batch_alter_table("drs_object", schema=None) as batch_op: + batch_op.add_column(sa.Column("bundle_id", sa.INTEGER(), nullable=True)) + batch_op.create_foreign_key(None, "drs_bundle", ["bundle_id"], ["id"]) + + # ### end Alembic commands ### diff --git a/chord_drs/models.py b/chord_drs/models.py index b33a09b..c317d05 100644 --- a/chord_drs/models.py +++ b/chord_drs/models.py @@ -17,7 +17,6 @@ "Base", "DrsMixin", "DrsBlob", - "DrsBundle", ] Base = declarative_base() @@ -38,44 +37,10 @@ class DrsMixin: public = Column(Boolean, default=False, nullable=False) # If true, the object is accessible by anyone -class DrsBundle(Base, DrsMixin): - __tablename__ = "drs_bundle" - - id = Column(String, primary_key=True) - parent_bundle_id = Column(Integer, ForeignKey("drs_bundle.id")) - parent_bundle = relationship("DrsBundle", remote_side=[id]) - objects = relationship("DrsBlob", cascade="all, delete-orphan", backref="bundle") - - def __init__(self, *args, **kwargs): - self.id = str(uuid4()) - super().__init__(*args, **kwargs) - - def update_checksum_and_size(self): - # For bundle checksumming logic, see the `checksums` field in - # https://ga4gh.github.io/data-repository-service-schemas/preview/release/drs-1.3.0/docs/#tag/DrsObjectModel - - checksums = [] - total_size = 0 - - for obj in self.objects: - total_size += obj.size - checksums.append(obj.checksum) - - checksums.sort() - concat_checksums = "".join(checksums) - - hash_obj = sha256() - hash_obj.update(concat_checksums.encode()) - - self.checksum = hash_obj.hexdigest() - self.size = total_size - - class DrsBlob(Base, DrsMixin): __tablename__ = "drs_object" id = Column(String, primary_key=True) - bundle_id = Column(Integer, ForeignKey(DrsBundle.id), nullable=True) location = Column(String(500), nullable=False) def __init__(self, *args, **kwargs): diff --git a/chord_drs/routes.py b/chord_drs/routes.py index cc69e95..41cc757 100644 --- a/chord_drs/routes.py +++ b/chord_drs/routes.py @@ -28,8 +28,8 @@ from .backend import get_backend from .constants import BENTO_SERVICE_KIND, SERVICE_NAME, SERVICE_TYPE from .db import db -from .models import DrsBlob, DrsBundle -from .serialization import build_bundle_json, build_blob_json +from .models import DrsBlob +from .serialization import build_blob_json from .utils import drs_file_checksum @@ -65,7 +65,7 @@ def _post_headers_getter(r: Request) -> dict[str, str]: def check_objects_permission( - drs_objs: list[DrsBlob | DrsBundle], permission: Permission, mark_authz_done: bool = False + drs_objs: list[DrsBlob], permission: Permission, mark_authz_done: bool = False ) -> tuple[bool, ...]: if not authz_enabled(): return tuple([True] * len(drs_objs)) # Assume we have permission for everything if authz disabled @@ -87,10 +87,10 @@ def check_objects_permission( ) # now a tuple of length len(drs_objs) of whether we have the permission for each object -def fetch_and_check_object_permissions(object_id: str, permission: Permission) -> tuple[DrsBlob | DrsBundle, bool]: +def fetch_and_check_object_permissions(object_id: str, permission: Permission) -> DrsBlob: has_permission_on_everything = check_everything_permission(permission) - drs_object, is_bundle = get_drs_object(object_id) + drs_object = get_drs_object(object_id) if not drs_object: authz_middleware.mark_authz_done(request) @@ -108,7 +108,7 @@ def fetch_and_check_object_permissions(object_id: str, permission: Permission) - raise forbidden() # ------------------------------------------------------------------- - return drs_object, is_bundle + return drs_object def bad_request_log_mark(err: str) -> BadRequest: @@ -149,35 +149,25 @@ def service_info(): ) -def get_drs_object(object_id: str) -> tuple[DrsBlob | DrsBundle | None, bool]: - if drs_bundle := DrsBundle.query.filter_by(id=object_id).first(): - return drs_bundle, True - - # Only try hitting the database for an object if no bundle was found - if drs_blob := DrsBlob.query.filter_by(id=object_id).first(): - return drs_blob, False - - return None, False +def get_drs_object(object_id: str) -> DrsBlob | None: + return DrsBlob.query.filter_by(id=object_id).first() def delete_drs_object(object_id: str, logger: logging.Logger): - drs_object, is_bundle = fetch_and_check_object_permissions(object_id, P_DELETE_DATA) + drs_object = fetch_and_check_object_permissions(object_id, P_DELETE_DATA) logger.info(f"Deleting object {drs_object.id}") - if not is_bundle: - q = DrsBlob.query.filter_by(location=drs_object.location) - n_using_file = q.count() - if n_using_file == 1 and q.first().id == drs_object.id: - # If this object is the only one using the file, delete the file too - # TODO: this can create a race condition and leave files undeleted... should we have a cleanup on start? - logger.info( - f"Deleting file at {drs_object.location}, since {drs_object.id} is the only object referring to it." - ) - backend = get_backend() - backend.delete(drs_object.location) - - # Don't bother with additional bundle deleting logic, they'll be removed soon anyway. TODO + q = DrsBlob.query.filter_by(location=drs_object.location) + n_using_file = q.count() + if n_using_file == 1 and q.first().id == drs_object.id: + # If this object is the only one using the file, delete the file too + # TODO: this can create a race condition and leave files undeleted... should we have a cleanup on start? + logger.info( + f"Deleting file at {drs_object.location}, since {drs_object.id} is the only object referring to it." + ) + backend = get_backend() + backend.delete(drs_object.location) db.session.delete(drs_object) db.session.commit() @@ -190,15 +180,11 @@ def object_info(object_id: str): delete_drs_object(object_id, current_app.logger) return current_app.response_class(status=204) - drs_object, is_bundle = fetch_and_check_object_permissions(object_id, P_QUERY_DATA) + drs_object = fetch_and_check_object_permissions(object_id, P_QUERY_DATA) # The requester can ask for additional, non-spec-compliant Bento properties to be included in the response with_bento_properties: bool = str_to_bool(request.args.get("with_bento_properties", "")) - if is_bundle: - expand: bool = str_to_bool(request.args.get("expand", "")) - return jsonify(build_bundle_json(drs_object, expand=expand, with_bento_properties=with_bento_properties)) - # The requester can specify object internal path to be added to the response use_internal_path: bool = str_to_bool(request.args.get("internal_path", "")) @@ -222,8 +208,6 @@ def object_access(object_id: str, access_id: str): @drs_service.route("/search", methods=["GET"]) def object_search(): - # TODO: Enable search for bundles too - response = [] name: str | None = request.args.get("name") @@ -262,12 +246,7 @@ def object_search(): def object_download(object_id: str): logger = current_app.logger - # TODO: Bundle download - - drs_object, is_bundle = fetch_and_check_object_permissions(object_id, P_DOWNLOAD_DATA) - - if is_bundle: - raise BadRequest("Bundle download is currently unsupported") + drs_object = fetch_and_check_object_permissions(object_id, P_DOWNLOAD_DATA) obj_name = drs_object.name minio_obj = drs_object.return_minio_object() @@ -336,9 +315,6 @@ def generate_bytes(): @drs_service.route("/ingest", methods=["POST"]) def object_ingest(): - # TODO: Enable specifying a parent bundle - # TODO: If a parent is specified, make sure we have permissions to ingest into it? How to reconcile? - logger = current_app.logger data = request.form or {} diff --git a/chord_drs/serialization.py b/chord_drs/serialization.py index d1ce098..467a49a 100644 --- a/chord_drs/serialization.py +++ b/chord_drs/serialization.py @@ -7,12 +7,11 @@ from urllib.parse import urlparse from .data_sources import DATA_SOURCE_LOCAL, DATA_SOURCE_MINIO -from .models import DrsMixin, DrsBlob, DrsBundle -from .types import DRSAccessMethodDict, DRSContentsDict, DRSObjectBentoDict, DRSObjectDict +from .models import DrsMixin, DrsBlob +from .types import DRSAccessMethodDict, DRSObjectBentoDict, DRSObjectDict __all__ = [ - "build_bundle_json", "build_blob_json", ] @@ -25,32 +24,6 @@ def create_drs_uri(object_id: str) -> str: return f"drs://{get_drs_host()}/{object_id}" -def build_contents(bundle: DrsBundle, expand: bool) -> list[DRSContentsDict]: - content: list[DRSContentsDict] = [] - bundles = DrsBundle.query.filter_by(parent_bundle=bundle).all() - - for b in bundles: - content.append( - { - **({"contents": build_contents(b, expand)} if expand else {}), - "drs_uri": create_drs_uri(b.id), - "id": b.id, - "name": b.name, # TODO: Can overwrite... see spec - } - ) - - for c in bundle.objects: - content.append( - { - "drs_uri": create_drs_uri(c.id), - "id": c.id, - "name": c.name, # TODO: Can overwrite... see spec - } - ) - - return content - - def build_bento_object_json(drs_object: DrsMixin) -> DRSObjectBentoDict: return { "project_id": drs_object.project_id, @@ -60,30 +33,6 @@ def build_bento_object_json(drs_object: DrsMixin) -> DRSObjectBentoDict: } -def build_bundle_json( - drs_bundle: DrsBundle, - expand: bool = False, - with_bento_properties: bool = False, -) -> DRSObjectDict: - return { - "contents": build_contents(drs_bundle, expand), - "checksums": [ - { - "checksum": drs_bundle.checksum, - "type": "sha-256", - }, - ], - "created_time": f"{drs_bundle.created.isoformat('T')}Z", - "size": drs_bundle.size, - "name": drs_bundle.name, - # Description should be excluded if null in the database - **({"description": drs_bundle.description} if drs_bundle.description is not None else {}), - "id": drs_bundle.id, - "self_uri": create_drs_uri(drs_bundle.id), - **({"bento": build_bento_object_json(drs_bundle)} if with_bento_properties else {}), - } - - def build_blob_json( drs_blob: DrsBlob, inside_container: bool = False, diff --git a/chord_drs/types.py b/chord_drs/types.py index 391e99b..d16b510 100644 --- a/chord_drs/types.py +++ b/chord_drs/types.py @@ -4,7 +4,6 @@ "DRSAccessURLDict", "DRSAccessMethodDict", "DRSChecksumDict", - "DRSContentsDict", "DRSObjectBentoDict", "DRSObjectDict", ] @@ -47,12 +46,6 @@ class _DRSContentsDictBase(TypedDict): name: str -class DRSContentsDict(_DRSContentsDictBase, total=False): - id: str - drs_uri: str - contents: list["DRSContentsDict"] - - class DRSObjectBentoDict(TypedDict): project_id: str | None dataset_id: str | None @@ -67,6 +60,5 @@ class DRSObjectDict(_DRSObjectDictBase, total=False): updated_time: str version: str mime_type: str - contents: list[DRSContentsDict] aliases: list[str] bento: DRSObjectBentoDict diff --git a/pyproject.toml b/pyproject.toml index 63585a0..f465fc7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "chord-drs" -version = "0.17.1" +version = "0.18.0" description = "An implementation of a data repository system (as per GA4GH's specs) for the Bento platform." authors = ["David Lougheed "] license = "LGPL-3.0" diff --git a/tests/conftest.py b/tests/conftest.py index 26a0595..b748695 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -35,10 +35,10 @@ def dummy_file_path() -> str: # Function rather than constant so we can set env return str(APP_DIR.parent / "tests" / "dummy_file.txt") -def dummy_directory_path() -> str: # Function rather than constant so we can set environ first +def dummy_directory_path() -> pathlib.Path: # Function rather than constant so we can set environ first from chord_drs.config import APP_DIR - return str(APP_DIR / "migrations") + return APP_DIR / "migrations" def empty_file_path(): # Function rather than constant so we can set environ first @@ -128,22 +128,29 @@ def drs_object(): @pytest.fixture -def drs_bundle(): +def drs_multi_object(): os.environ["BENTO_AUTHZ_SERVICE_URL"] = AUTHZ_URL from chord_drs.app import db - from chord_drs.commands import create_drs_bundle + from chord_drs.models import DrsBlob - bundle = create_drs_bundle( - dummy_directory_path(), - project_id=DUMMY_PROJECT_ID, - dataset_id=DUMMY_DATASET_ID, - data_type=DATA_TYPE_PHENOPACKET, - ) + objs = [] + + for f in dummy_directory_path().glob("*"): + if f.is_file(): + obj = DrsBlob( + location=str(f), + project_id=DUMMY_PROJECT_ID, + dataset_id=DUMMY_DATASET_ID, + data_type=DATA_TYPE_PHENOPACKET, + ) + + db.session.add(obj) + objs.append(obj) db.session.commit() - yield bundle + return objs @pytest.fixture @@ -164,24 +171,3 @@ def drs_object_minio(): db.session.commit() yield drs_object - - -@pytest.fixture -def drs_bundle_minio(): - os.environ["BENTO_AUTHZ_SERVICE_URL"] = AUTHZ_URL - - from chord_drs.app import db - from chord_drs.commands import create_drs_bundle - - with mock_s3(): - bundle = create_drs_bundle( - dummy_directory_path(), - project_id=DUMMY_PROJECT_ID, - dataset_id=DUMMY_DATASET_ID, - data_type=DATA_TYPE_PHENOPACKET, - exclude=frozenset({"versions", "__pycache__"}), - ) - - db.session.commit() - - yield bundle diff --git a/tests/test_commands.py b/tests/test_commands.py index e2a6baa..896f2bc 100644 --- a/tests/test_commands.py +++ b/tests/test_commands.py @@ -1,10 +1,9 @@ from click.testing import CliRunner from chord_drs.commands import ingest -from chord_drs.models import DrsBlob, DrsBundle +from chord_drs.models import DrsBlob from tests.conftest import ( non_existant_dummy_file_path, dummy_file_path, - dummy_directory_path, ) @@ -18,7 +17,6 @@ def test_ingest_fail(client_local): def test_ingest(client_local): dummy_file = dummy_file_path() - dummy_dir = dummy_directory_path() runner = CliRunner() result = runner.invoke(ingest, [dummy_file]) @@ -29,11 +27,3 @@ def test_ingest(client_local): assert result.exit_code == 0 assert obj.name == filename assert obj.location - - result = runner.invoke(ingest, [dummy_dir]) - - filename = dummy_dir.split("/")[-1] - bundle = DrsBundle.query.filter_by(name=filename).first() - - assert result.exit_code == 0 - assert len(bundle.objects) > 0 diff --git a/tests/test_routes.py b/tests/test_routes.py index c999c9f..3cef002 100644 --- a/tests/test_routes.py +++ b/tests/test_routes.py @@ -337,45 +337,7 @@ def test_object_multi_delete(client): @responses.activate -def test_bundle_and_download(client, drs_bundle): - authz_everything_true() - - res = client.get(f"/objects/{drs_bundle.id}") - data = res.get_json() - - assert res.status_code == 200 - assert "access_methods" not in data # TODO: there should be access_methods for bundles... although it is spec-opt. - # issue again with the number of files ingested when ran locally vs travis-ci - assert "contents" in data and len(data["contents"]) > 0 - assert "name" in data and data["name"] == drs_bundle.name - - assert "checksums" in data and len("checksums") > 0 - - assert "created_time" in data - assert "size" in data - assert "id" in data and data["id"] == drs_bundle.id - - # jsonify sort alphabetically - makes it that the last element will be - # an object and not a bundle - obj = data["contents"][-1] - - # Fetch nested object record by ID - res = client.get(f"/objects/{obj['id']}") - assert res.status_code == 200 - nested_obj = res.get_json() - - # Fetch nested object bytes - res = client.get(nested_obj["access_methods"][0]["access_url"]["url"]) - assert res.status_code == 200 - assert res.content_length == nested_obj["size"] - - # Bundle download is currently unimplemented - res = client.get(f"/objects/{drs_bundle.id}/download") - assert res.status_code == 400 - - -@responses.activate -def test_search_bad_query(client, drs_bundle): +def test_search_bad_query(client, drs_multi_object): authz_everything_true() res = client.get("/search") @@ -390,8 +352,8 @@ def test_search_bad_query(client, drs_bundle): "/search?fuzzy_name=asd", ), ) -def test_search_object_empty(client, drs_bundle, url): - authz_everything_true(count=len(drs_bundle.objects)) +def test_search_object_empty(client, drs_multi_object, url): + authz_everything_true(count=len(drs_multi_object)) res = client.get(url) data = res.get_json() @@ -412,8 +374,8 @@ def test_search_object_empty(client, drs_bundle, url): "/search?q=alembic.ini&internal_path=1", ), ) -def test_search_object(client, drs_bundle, url): - authz_everything_true(count=len(drs_bundle.objects)) # TODO: + 1 once we can search bundles +def test_search_object(client, drs_multi_object, url): + authz_everything_true(count=len(drs_multi_object)) res = client.get(url) data = res.get_json() @@ -426,8 +388,8 @@ def test_search_object(client, drs_bundle, url): @responses.activate -def test_search_no_permissions(client, drs_bundle): - authz_everything_false(count=len(drs_bundle.objects)) +def test_search_no_permissions(client, drs_multi_object): + authz_everything_false(count=len(drs_multi_object)) res = client.get("/search?name=alembic.ini") data = res.get_json()