Skip to content

Commit

Permalink
Merge pull request #496 from bento-platform/refact/discovery-and-over…
Browse files Browse the repository at this point in the history
…views

refact!: discovery module + revised overview endpoints
  • Loading branch information
davidlougheed authored Apr 12, 2024
2 parents a1edcb6 + 3a0f715 commit fa5ef34
Show file tree
Hide file tree
Showing 32 changed files with 2,099 additions and 1,611 deletions.
3 changes: 2 additions & 1 deletion chord_metadata_service/chord/ingest/phenopackets.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
from chord_metadata_service.chord.models import Project, ProjectJsonSchema, Dataset
from chord_metadata_service.phenopackets import models as pm
from chord_metadata_service.phenopackets.schemas import PHENOPACKET_SCHEMA, VRS_REF_REGISTRY
from chord_metadata_service.phenopackets.utils import time_element_to_years
from chord_metadata_service.patients.values import KaryotypicSex
from chord_metadata_service.restapi.schema_utils import patch_project_schemas
from chord_metadata_service.restapi.types import ExtensionSchemaDict
from chord_metadata_service.restapi.utils import remove_computed_properties, time_element_to_years
from chord_metadata_service.restapi.utils import remove_computed_properties

from .exceptions import IngestError
from .resources import ingest_resource
Expand Down
75 changes: 28 additions & 47 deletions chord_metadata_service/chord/views_search.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import asyncio
import itertools
import json
import logging

from adrf.decorators import api_view as async_api_view
from bento_lib.responses import errors
from bento_lib.search import build_search_response, postgres

Expand All @@ -22,20 +24,18 @@
from chord_metadata_service.chord.permissions import OverrideOrSuperUserOnly, ReadOnly

from chord_metadata_service.logger import logger
from chord_metadata_service.restapi.utils import get_field_bins, queryset_stats_for_field

from chord_metadata_service.experiments.api_views import EXPERIMENT_SELECT_REL, EXPERIMENT_PREFETCH
from chord_metadata_service.experiments.models import Experiment
from chord_metadata_service.experiments.serializers import ExperimentSerializer

from chord_metadata_service.experiments.summaries import dt_experiment_summary

from chord_metadata_service.metadata.elastic import es

from chord_metadata_service.patients.models import Individual

from chord_metadata_service.phenopackets.api_views import PHENOPACKET_SELECT_REL, PHENOPACKET_PREFETCH
from chord_metadata_service.phenopackets.models import Phenopacket, Biosample
from chord_metadata_service.phenopackets.serializers import PhenopacketSerializer
from chord_metadata_service.phenopackets.summaries import dt_phenopacket_summary

from .data_types import DATA_TYPE_EXPERIMENT, DATA_TYPE_PHENOPACKET, DATA_TYPES
from .models import Dataset
Expand All @@ -47,50 +47,20 @@
OUTPUT_FORMAT_BENTO_SEARCH_RESULT = "bento_search_result"


def experiment_dataset_summary(dataset):
    """
    Build the experiment summary for a dataset.

    :param dataset: Value used to filter Experiment rows (Experiment.objects.filter(dataset=...)).
    :return: A dictionary with the number of matching experiments under "count" and an (as yet empty)
             "data_type_specific" dictionary.
    """
    n_experiments = Experiment.objects.filter(dataset=dataset).count()

    summary = {"count": n_experiments}
    summary["data_type_specific"] = {}  # TODO
    return summary
async def experiment_dataset_summary(_request: DrfRequest, dataset):
    """
    Summarize the experiments of a dataset by delegating to dt_experiment_summary.

    :param _request: The current request; not used by this function.
    :param dataset: Value used to filter Experiment rows (Experiment.objects.filter(dataset=...)).
    :return: Whatever dt_experiment_summary yields for the filtered queryset.
    """
    dataset_experiments = Experiment.objects.filter(dataset=dataset)
    # NOTE(review): low_counts_censored=False presumably disables low-count censoring - confirm in
    # dt_experiment_summary.
    summary = await dt_experiment_summary(dataset_experiments, low_counts_censored=False)
    return summary


def phenopacket_dataset_summary(dataset):
    """
    Build the phenopacket summary for a dataset.

    :param dataset: Value used to filter Phenopacket rows (Phenopacket.objects.filter(dataset=...)).
    :return: A dictionary with the phenopacket count under "count" and, under "data_type_specific",
             statistics for biosamples, diseases, individuals and phenotypic features.
    """
    packets = Phenopacket.objects.filter(dataset=dataset)  # TODO

    def stats_for(lookup):
        # Shorthand so that each statistic below only has to name its ORM lookup path.
        return queryset_stats_for_field(packets, lookup)

    # Sex-related stats are computed up front; further down they are re-keyed by the choices declared
    # on Individual, so that choices with no entry in the stats are reported as 0.
    sex_stats = stats_for("subject__sex")
    karyotypic_sex_stats = stats_for("subject__karyotypic_sex")

    n_phenopackets = packets.count()

    biosamples = {
        "count": packets.values("biosamples__id").count(),
        "is_control_sample": stats_for("biosamples__is_control_sample"),
        "taxonomy": stats_for("biosamples__taxonomy__label"),
    }

    diseases = stats_for("diseases__term__label")

    individuals = {
        "count": packets.values("subject__id").count(),
        "sex": {choice[0]: sex_stats.get(choice[0], 0) for choice in Individual.SEX},
        "karyotypic_sex": {
            choice[0]: karyotypic_sex_stats.get(choice[0], 0) for choice in Individual.KARYOTYPIC_SEX
        },
        "taxonomy": stats_for("subject__taxonomy__label"),
        "age": get_field_bins(packets, "subject__age_numeric", 10),
    }

    phenotypic_features = stats_for("phenotypic_features__pftype__label")

    return {
        "count": n_phenopackets,
        "data_type_specific": {
            "biosamples": biosamples,
            "diseases": diseases,
            "individuals": individuals,
            "phenotypic_features": phenotypic_features,
        },
    }
async def phenopacket_dataset_summary(_request: DrfRequest, dataset: Dataset):
    """
    Summarize the phenopackets of a dataset by delegating to dt_phenopacket_summary.

    :param _request: The current request; not used by this function.
    :param dataset: Dataset whose phenopackets are selected via Phenopacket.objects.filter(dataset=...).
    :return: Whatever dt_phenopacket_summary yields for the filtered queryset.
    """
    dataset_phenopackets = Phenopacket.objects.filter(dataset=dataset)
    # NOTE(review): low_counts_censored=False presumably disables low-count censoring - confirm in
    # dt_phenopacket_summary.
    summary = await dt_phenopacket_summary(dataset_phenopackets, low_counts_censored=False)
    return summary


# TODO: CHORD-standardized logging
def debug_log(message):  # pragma: no cover
    """Emit `message` at DEBUG level via the root logging module, prefixed with the current local time."""
    timestamp = datetime.now()
    logging.debug(f"[CHORD Metadata {timestamp}] [DEBUG] {message}")


def get_field_lookup(field):
def get_field_lookup(field: list[str]) -> str:
"""
Given a field identifier as a schema-like path e.g. ['biosamples', '[item]', 'id'],
returns a Django ORM field lookup string e.g. 'biosamples__id'
Expand All @@ -116,7 +86,7 @@ def get_values_list(queryset, options):
return queryset.values_list(field_lookup, flat=True)


def data_type_results(query, params, key="id"):
def data_type_results(query: sql.SQL, params, key="id"):
with connection.cursor() as cursor:
debug_log(f"Executing SQL:\n {query.as_string(cursor.connection)}")
cursor.execute(query.as_string(cursor.connection), params)
Expand Down Expand Up @@ -569,11 +539,22 @@ def private_dataset_search(request: DrfRequest, dataset_id: str):
return dataset_search(request=request, dataset_id=dataset_id, internal=True)


@api_view(["GET"])
# Registry mapping each data type ID to the function which summarizes that data type for a dataset.
# dataset_summary calls every entry as fn(request, dataset) and awaits the results.
DATASET_DATA_TYPE_SUMMARY_FUNCTIONS = {
    DATA_TYPE_PHENOPACKET: phenopacket_dataset_summary,
    DATA_TYPE_EXPERIMENT: experiment_dataset_summary,
}


@async_api_view(["GET"])
@permission_classes([OverrideOrSuperUserOnly | ReadOnly])
def dataset_summary(request: DrfRequest, dataset_id: str):
    # Looks up the dataset whose identifier is dataset_id, then responds with one summary per data type,
    # keyed by data type ID. (Comments rather than a docstring: DRF surfaces view docstrings as the
    # endpoint description.)
    target = Dataset.objects.get(identifier=dataset_id)

    payload = {}
    payload[DATA_TYPE_PHENOPACKET] = phenopacket_dataset_summary(dataset=target)
    payload[DATA_TYPE_EXPERIMENT] = experiment_dataset_summary(dataset=target)
    return Response(payload)
async def dataset_summary(request: DrfRequest, dataset_id: str):
    # Looks up the dataset whose identifier is dataset_id, runs every summary function registered in
    # DATASET_DATA_TYPE_SUMMARY_FUNCTIONS concurrently, and responds with a single JSON object mapping
    # each data type ID to its summary. (Comments rather than a docstring: DRF surfaces view docstrings
    # as the endpoint description.)

    # TODO: PERMISSIONS

    dataset = await Dataset.objects.aget(identifier=dataset_id)

    # asyncio.gather returns results in the same order as the awaitables it was given, so the results
    # line up with the registry's (insertion-ordered) keys.
    summaries = await asyncio.gather(
        *(summary_fn(request, dataset) for summary_fn in DATASET_DATA_TYPE_SUMMARY_FUNCTIONS.values())
    )

    # The response body must be ONE mapping of data type -> summary. Passing a generator of single-key
    # dicts (`{dt: s} for ...`) would not produce a JSON object keyed by data type.
    return Response(dict(zip(DATASET_DATA_TYPE_SUMMARY_FUNCTIONS, summaries)))
Empty file.
191 changes: 191 additions & 0 deletions chord_metadata_service/discovery/api_views.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
import asyncio

from adrf.decorators import api_view
from django.conf import settings
from drf_spectacular.utils import extend_schema, inline_serializer
from rest_framework import serializers, status
from rest_framework.decorators import permission_classes
from rest_framework.permissions import AllowAny
from rest_framework.request import Request as DrfRequest
from rest_framework.response import Response

from .types import BinWithValue
from ..chord import models as cm
from ..logger import logger

from .fields import get_field_options, get_range_stats, get_categorical_stats, get_date_stats
from .model_lookups import PUBLIC_MODEL_NAMES_TO_MODEL


@extend_schema(
    description="Public search fields with their configuration",
    responses={
        status.HTTP_200_OK: inline_serializer(
            name='public_search_fields_response',
            fields={'sections': serializers.JSONField()}
        ),
        status.HTTP_404_NOT_FOUND: inline_serializer(
            name='public_search_fields_not_configured',
            fields={'message': serializers.CharField()},
        ),
    }
)
@api_view(["GET"])
@permission_classes([AllowAny])
async def public_search_fields(_request: DrfRequest):
    """
    get:
    Return public search fields with their configuration
    """

    # TODO: should be project-scoped

    config_public = settings.CONFIG_PUBLIC

    # Without a public configuration there is nothing to expose: respond with a 404 and the configured message.
    if not config_public:
        return Response(settings.NO_PUBLIC_FIELDS_CONFIGURED, status=status.HTTP_404_NOT_FOUND)

    field_conf = config_public["fields"]

    # Note: the array is wrapped in a dictionary structure to help with JSON
    # processing by some services.

    async def _get_field_response(field) -> dict:
        # Field configuration, plus the field's ID and its options (fetched with low counts censored).
        field_props = field_conf[field]

        return {
            **field_props,
            "id": field,
            "options": await get_field_options(field_props, low_counts_censored=True),
        }

    async def _get_section_response(section) -> dict:
        # Section configuration with its list of field IDs replaced by full field responses.
        # Every field yields a response, so the coroutines go to gather() directly: wrapping them in
        # filter(None, ...) cannot drop anything, since coroutine objects are always truthy.
        return {
            **section,
            "fields": await asyncio.gather(*map(_get_field_response, section["fields"])),
        }

    return Response({
        "sections": await asyncio.gather(*map(_get_section_response, config_public["search"])),
    })


async def _counts_for_model_name(mn: str) -> tuple[str, int]:
    """Return (mn, count), where count is the number of rows of the model registered under the name mn."""
    model = PUBLIC_MODEL_NAMES_TO_MODEL[mn]
    total = await model.objects.all().acount()
    return mn, total


@extend_schema(
description="Overview of all public data in the database",
responses={
status.HTTP_200_OK: inline_serializer(
name='public_overview_response',
fields={'datasets': serializers.CharField()}
),
status.HTTP_404_NOT_FOUND: inline_serializer(
name='public_overview_not_available',
fields={'message': serializers.CharField()},
),
}
)
@api_view(["GET"]) # Don't use BentoAllowAny, we want to be more careful of cases here.
@permission_classes([AllowAny])
async def public_overview(_request: DrfRequest):
"""
get:
Overview of all public data in the database
"""

config_public = settings.CONFIG_PUBLIC

if not config_public:
return Response(settings.NO_PUBLIC_DATA_AVAILABLE, status=status.HTTP_404_NOT_FOUND)

# TODO: public overviews SHOULD be project-scoped at least.

# Predefined counts
counts = dict(await asyncio.gather(*map(_counts_for_model_name, PUBLIC_MODEL_NAMES_TO_MODEL)))

# Get the rules config - because we used get_config_public_and_field_set_permissions with no arguments, it'll choose
# these values based on if we have access to ALL public fields or not.
rules_config = config_public["rules"]
count_threshold = rules_config["count_threshold"]

# Set counts to 0 if they're under the count threshold, and we don't have full data access permissions for the
# data type corresponding to the model.
for public_model_name in counts:
if 0 < counts[public_model_name] <= count_threshold:
logger.info(f"Public overview: {public_model_name} count is below count threshold")
counts[public_model_name] = 0

response = {
"layout": config_public["overview"],
"fields": {},
"counts": {
"individuals": counts["individual"],
"biosamples": counts["biosample"],
"experiments": counts["experiment"],
},
# TODO: remove these in favour of public_rules endpoint
"max_query_parameters": rules_config["max_query_parameters"],
"count_threshold": count_threshold,
}

# Parse the public config to gather data for each field defined in the overview

fields = [chart["field"] for section in config_public["overview"] for chart in section["charts"]]
field_conf = config_public["fields"]

async def _get_field_response(field_id: str, field_props: dict) -> dict:
stats: list[BinWithValue] | None
if field_props["datatype"] == "string":
stats = await get_categorical_stats(field_props, low_counts_censored=True)
elif field_props["datatype"] == "number":
stats = await get_range_stats(field_props, low_counts_censored=True)
elif field_props["datatype"] == "date":
stats = await get_date_stats(field_props, low_counts_censored=True)
else:
raise NotImplementedError()

return {
**field_props,
"id": field_id,
**({"data": stats} if stats is not None else {}),
}

# Parallel async collection of field responses for public overview
field_responses = await asyncio.gather(*(_get_field_response(field, field_conf[field]) for field in fields))

for field, field_res in zip(fields, field_responses):
response["fields"][field] = field_res

return Response(response)


@api_view(["GET"])
@permission_classes([AllowAny])
async def public_dataset(_request: DrfRequest):
    """
    get:
    Properties of the datasets
    """

    # No permission checks are made here for now.
    # A view:dataset permission (or something similar) could be introduced in the future.

    public_conf = settings.CONFIG_PUBLIC
    if not public_conf:
        return Response(settings.NO_PUBLIC_DATA_AVAILABLE, status=status.HTTP_404_NOT_FOUND)

    # Dataset columns (provenance metadata) exposed by this endpoint
    provenance_fields = (
        "title", "description", "contact_info",
        "dates", "stored_in", "spatial_coverage",
        "types", "privacy", "distributions",
        "dimensions", "primary_publications", "citations",
        "produced_by", "creators", "licenses",
        "acknowledges", "keywords", "version", "dats_file",
        "extra_properties", "identifier",
    )

    return Response({"datasets": cm.Dataset.objects.values(*provenance_fields)})
42 changes: 42 additions & 0 deletions chord_metadata_service/discovery/censorship.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import sys

from django.conf import settings

# Names exported by `from ... import *`.
__all__ = [
    "RULES_NO_PERMISSIONS",
    "get_threshold",
    "thresholded_count",
    "get_max_query_parameters",
]


# Most restrictive rule set. get_threshold and get_max_query_parameters fall back to these values when
# censoring is requested but settings.CONFIG_PUBLIC is falsy.
RULES_NO_PERMISSIONS = {
    "max_query_parameters": 0,  # default to no query parameters allowed
    "count_threshold": sys.maxsize,  # default to MAXINT count threshold (i.e., no counts can be seen)
}


def get_threshold(low_counts_censored: bool) -> int:
    """
    Return the count threshold at or under which counts are censored (i.e., reported as 0).

    :param low_counts_censored: Whether low counts should be censored at all.
    :return: 0 when censoring is off; otherwise the "count_threshold" rule from settings.CONFIG_PUBLIC, or from
             RULES_NO_PERMISSIONS when no public configuration is set.
    """
    if low_counts_censored:
        public_conf = settings.CONFIG_PUBLIC
        rules = public_conf["rules"] if public_conf else RULES_NO_PERMISSIONS
        return rules["count_threshold"]
    return 0


def thresholded_count(c: int, low_counts_censored: bool) -> int:
    """
    Censor a count: return 0 when `c` is at or under the threshold given by get_threshold(low_counts_censored),
    and `c` itself otherwise.
    """
    limit = get_threshold(low_counts_censored)
    if c <= limit:
        return 0
    return c


def get_max_query_parameters(low_counts_censored: bool) -> int:
    """
    Return the maximum number of query parameters allowed for a discovery query.

    :param low_counts_censored: Whether the query is subject to censorship rules.
    :return: sys.maxsize when censoring is off; otherwise the "max_query_parameters" rule from
             settings.CONFIG_PUBLIC, or from RULES_NO_PERMISSIONS when no public configuration is set.
    """
    if low_counts_censored:
        public_conf = settings.CONFIG_PUBLIC
        rules = public_conf["rules"] if public_conf else RULES_NO_PERMISSIONS
        return rules["max_query_parameters"]
    return sys.maxsize
Loading

0 comments on commit fa5ef34

Please sign in to comment.