Skip to content

Commit

Permalink
Merge pull request #135 from bento-platform/feat/mime-type
Browse files Browse the repository at this point in the history
feat: add optional mime_type property to DRS object
  • Loading branch information
davidlougheed authored Nov 8, 2024
2 parents 987ac01 + 402f293 commit 3018bcd
Show file tree
Hide file tree
Showing 9 changed files with 992 additions and 741 deletions.
15 changes: 15 additions & 0 deletions chord_drs/constants.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from bento_lib.service_info.helpers import build_service_type

__all__ = [
Expand All @@ -6,10 +7,24 @@
"SERVICE_ARTIFACT",
"DRS_SPEC_VERSION",
"SERVICE_TYPE",
"RE_INGESTABLE_MIME_TYPE",
"MIME_OCTET_STREAM",
]

# Identifier for this service's kind; also reused below as the service-type artifact.
BENTO_SERVICE_KIND = "drs"
# Human-readable service name.
SERVICE_NAME = "Bento Data Repository Service"
SERVICE_ARTIFACT = BENTO_SERVICE_KIND
DRS_SPEC_VERSION = "1.4.0" # update to match whatever version of the DRS spec is implemented.
# Built from the "org.ga4gh" group, the artifact and the spec version above
# (presumably the GA4GH service-info "type" object - confirm against bento_lib).
SERVICE_TYPE = build_service_type("org.ga4gh", SERVICE_ARTIFACT, DRS_SPEC_VERSION)

# Pattern for MIME types which may be attached to an ingested object.
# See https://datatracker.ietf.org/doc/html/rfc2045#section-5.1
# and https://datatracker.ietf.org/doc/html/rfc6838#section-4.2
# - only allow discrete-type content types
# - allow one optional parameter specifying encoding and whatnot, e.g. "text/html; charset=utf-8"
# - for a list of currently assigned MIME types, see https://www.iana.org/assignments/media-types/media-types.xhtml
# - never accept line breaks: the pattern ends with \Z rather than $ ($ also matches just before a trailing
#   newline, so "text/plain\n" would pass), and only a space or tab - not any \s - may follow the ";".
RE_INGESTABLE_MIME_TYPE = re.compile(
    r"^(application|audio|font|image|model|text|video)"
    r"/[a-zA-Z0-9][a-zA-Z0-9!#$&^_\-+.]*"
    r"(;[ \t]?[a-zA-Z0-9\-_.]+=\"?[a-zA-Z0-9\-_./+ ]*\"?)?\Z"
)
# Generic binary MIME type, for use when no more specific type is known.
MIME_OCTET_STREAM = "application/octet-stream"
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""add object mime_type field
Revision ID: 4b4aaba9e448
Revises: 5e982af5cde4
Create Date: 2024-10-28 16:27:17.598861
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
# `revision` is this migration's ID; `down_revision` is the migration it revises (see the module docstring).
revision = '4b4aaba9e448'
down_revision = '5e982af5cde4'
branch_labels = None
depends_on = None


def upgrade():
    """Add the nullable, 128-character ``mime_type`` column to ``drs_object``."""
    # ### commands auto generated by Alembic - please adjust! ###
    mime_type_column = sa.Column('mime_type', sa.String(length=128), nullable=True)
    with op.batch_alter_table('drs_object', schema=None) as drs_object_table:
        drs_object_table.add_column(mime_type_column)
    # ### end Alembic commands ###


def downgrade():
    """Drop the ``mime_type`` column from ``drs_object``, reverting ``upgrade``."""
    # ### commands auto generated by Alembic - please adjust! ###
    column_name = 'mime_type'
    with op.batch_alter_table('drs_object', schema=None) as drs_object_table:
        drs_object_table.drop_column(column_name)
    # ### end Alembic commands ###
11 changes: 11 additions & 0 deletions chord_drs/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from .backend import get_backend
from .backends.minio import MinioBackend
from .constants import RE_INGESTABLE_MIME_TYPE
from .utils import drs_file_checksum

__all__ = [
Expand All @@ -34,6 +35,8 @@ class DrsBlob(Base):
name = Column(String(250), nullable=True)
description = Column(String(1000), nullable=True)

mime_type = Column(String(128), nullable=True) # if null, MIME type has not been set / isn't known

# Permissions/Bento-specific project & dataset tagging for DRS items
# TODO: Make some of these not nullable in the future:
project_id = Column(String(64), nullable=True) # Nullable for backwards-compatibility
Expand All @@ -57,6 +60,7 @@ def __init__(self, *args, **kwargs):
self.location = object_to_copy.location
self.size = object_to_copy.size
self.checksum = object_to_copy.checksum
self.mime_type = object_to_copy.mime_type
del kwargs["object_to_copy"]
else:
location = kwargs.get("location")
Expand All @@ -70,6 +74,13 @@ def __init__(self, *args, **kwargs):
self.name = secure_filename(filename or p.name)
new_filename = f"{self.id[:12]}-{self.name}" # TODO: use checksum for filename instead

# MIME type, if set, must be a valid ingestable MIME type (not a made-up supertype and not, e.g.,
# multipart/form-data).
mime_type: str | None = kwargs.get("mime_type")
if mime_type is not None and not RE_INGESTABLE_MIME_TYPE.match(mime_type):
raise ValueError("Invalid MIME type")
self.mime_type = mime_type

backend = get_backend()

if not backend:
Expand Down
20 changes: 14 additions & 6 deletions chord_drs/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,14 @@
from . import __version__
from .authz import authz_middleware
from .backend import get_backend
from .constants import BENTO_SERVICE_KIND, SERVICE_NAME, SERVICE_TYPE
from .constants import BENTO_SERVICE_KIND, SERVICE_NAME, SERVICE_TYPE, MIME_OCTET_STREAM
from .db import db
from .models import DrsBlob
from .serialization import build_blob_json
from .utils import drs_file_checksum


RE_STARTING_SLASH = re.compile(r"^/")
MIME_OCTET_STREAM = "application/octet-stream"
CHUNK_SIZE = 1024 * 128 # Read 128 KB at a time

drs_service = Blueprint("drs_service", __name__)
Expand Down Expand Up @@ -254,13 +253,17 @@ def object_download(object_id: str):
obj_name = drs_object.name
minio_obj = drs_object.return_minio_object()

# DRS objects have a nullable mime_type in the database. If mime_type is None, serve the object as a generic
# application/octet-stream.
mime_type: str = drs_object.mime_type or MIME_OCTET_STREAM

if not minio_obj:
# Check for "Range" HTTP header
range_header = request.headers.get("Range") # supports "headers={'Range': 'bytes=x-y'}"

if range_header is None:
# Early return, no range header so send the whole thing
res = make_response(send_file(drs_object.location, mimetype=MIME_OCTET_STREAM, download_name=obj_name))
res = make_response(send_file(drs_object.location, mimetype=mime_type, download_name=obj_name))
res.headers["Accept-Ranges"] = "bytes"
return res

Expand Down Expand Up @@ -298,8 +301,8 @@ def generate_bytes():
break

# Stream the bytes of the file or file segment from the generator function
r = current_app.response_class(generate_bytes(), status=206, mimetype=MIME_OCTET_STREAM)
r.headers["Content-Length"] = end + 1 - start # byte range is inclusive, so need to add one
r = current_app.response_class(generate_bytes(), status=206, mimetype=mime_type)
r.headers["Content-Length"] = str(end + 1 - start) # byte range is inclusive, so need to add one
r.headers["Content-Range"] = f"bytes {start}-{end}/{obj_size}"
r.headers["Content-Disposition"] = (
f"attachment; filename*=UTF-8'{urllib.parse.quote(obj_name, encoding='utf-8')}'"
Expand All @@ -309,7 +312,7 @@ def generate_bytes():
# TODO: Support range headers for MinIO objects - only the local backend supports it for now
# TODO: kinda greasy, not really sure we want to support such a feature later on
response = make_response(
send_file(minio_obj["Body"], mimetype="application/octet-stream", as_attachment=True, download_name=obj_name)
send_file(minio_obj["Body"], mimetype=mime_type, as_attachment=True, download_name=obj_name)
)

response.headers["Content-Length"] = minio_obj["ContentLength"]
Expand All @@ -327,7 +330,9 @@ def object_ingest():
dataset_id: str | None = data.get("dataset_id") or None # "
data_type: str | None = data.get("data_type") or None # "
public: bool = data.get("public", "false").strip().lower() == "true"

file = request.files.get("file")
mime_type: str | None = data.get("mime_type") or None # replace blank strings with None

logger.info(f"Received ingest request metadata: {data}")

Expand Down Expand Up @@ -409,6 +414,7 @@ def object_ingest():
drs_object = DrsBlob(
**(dict(object_to_copy=object_to_copy) if object_to_copy else dict(location=obj_path)),
filename=filename,
mime_type=mime_type,
project_id=project_id,
dataset_id=dataset_id,
data_type=data_type,
Expand All @@ -417,6 +423,8 @@ def object_ingest():
db.session.add(drs_object)
db.session.commit()
logger.info(f"Added DRS object: {drs_object}")
except ValueError as e:
raise bad_request_log_mark(str(e))
except Exception as e: # TODO: More specific handling
authz_middleware.mark_authz_done(request)
logger.error(f"Encountered exception during ingest: {e}")
Expand Down
2 changes: 2 additions & 0 deletions chord_drs/serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ def build_blob_json(
"name": drs_blob.name,
# Description should be excluded if null in the database
**({"description": drs_blob.description} if drs_blob.description is not None else {}),
# MIME type should be excluded if null in the database
**({"mime_type": drs_blob.mime_type} if drs_blob.mime_type is not None else {}),
"id": drs_blob.id,
"self_uri": create_drs_uri(drs_blob.id),
**({"bento": build_bento_object_json(drs_blob)} if with_bento_properties else {}),
Expand Down
Loading

0 comments on commit 3018bcd

Please sign in to comment.